mirror of https://github.com/explosion/spaCy.git

Merge github.com:explosion/spaCy into dutch

commit 4a3fdcce8a
--- a/README.rst
+++ b/README.rst
@@ -6,7 +6,7 @@ Cython. spaCy is built on the very latest research, but it isn't researchware.
 It was designed from day 1 to be used in real products. It's commercial
 open-source software, released under the MIT license.
 
-💫 **Version 1.2 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
+💫 **Version 1.3 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
 
 .. image:: http://i.imgur.com/wFvLZyJ.png
     :target: https://travis-ci.org/explosion/spaCy
@@ -241,8 +241,38 @@ calling ``spacy.load()``, or by passing a ``path`` argument to the ``spacy.en.En
 Changelog
 =========
 
-2016-11-04 `v1.2.0 <https://github.com/explosion/spaCy/releases>`_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
--------------------------------------------------------------------------------------------------------------------------------------------
+2016-12-03 `v1.3.0 <https://github.com/explosion/spaCy/releases>`_: *Improve API consistency*
+---------------------------------------------------------------------------------------------
+
+**✨ API improvements**
+
+* Add ``Span.sentiment`` attribute.
+* `#658 <https://github.com/explosion/spaCy/pull/658>`_: Add ``Span.noun_chunks`` iterator (thanks `@pokey <https://github.com/pokey>`_).
+* `#642 <https://github.com/explosion/spaCy/pull/642>`_: Let ``--data-path`` be specified when running download.py scripts (thanks `@ExplodingCabbage <https://github.com/ExplodingCabbage>`_).
+* `#638 <https://github.com/explosion/spaCy/pull/638>`_: Add German stopwords (thanks `@souravsingh <https://github.com/souravsingh>`_).
+* `#614 <https://github.com/explosion/spaCy/pull/614>`_: Fix ``PhraseMatcher`` to work with new ``Matcher`` (thanks `@sadovnychyi <https://github.com/sadovnychyi>`_).
+
+**🔴 Bug fixes**
+
+* Fix issue `#605 <https://github.com/explosion/spaCy/issues/605>`_: ``accept`` argument to ``Matcher`` now rejects matches as expected.
+* Fix issue `#617 <https://github.com/explosion/spaCy/issues/617>`_: ``Vocab.load()`` now works with string paths, as well as ``Path`` objects.
+* Fix issue `#639 <https://github.com/explosion/spaCy/issues/639>`_: Stop words in ``Language`` class now used as expected.
+* Fix issues `#656 <https://github.com/explosion/spaCy/issues/656>`_, `#624 <https://github.com/explosion/spaCy/issues/624>`_: ``Tokenizer`` special-case rules now support arbitrary token attributes.
+
+**📖 Documentation and examples**
+
+* Add `"Customizing the tokenizer" <https://spacy.io/docs/usage/customizing-tokenizer>`_ workflow.
+* Add `"Training the tagger, parser and entity recognizer" <https://spacy.io/docs/usage/training>`_ workflow.
+* Add `"Entity recognition" <https://spacy.io/docs/usage/entity-recognition>`_ workflow.
+* Fix various typos and inconsistencies.
+
+**👥 Contributors**
+
+Thanks to `@pokey <https://github.com/pokey>`_, `@ExplodingCabbage <https://github.com/ExplodingCabbage>`_, `@souravsingh <https://github.com/souravsingh>`_, `@sadovnychyi <https://github.com/sadovnychyi>`_, `@manojsakhwar <https://github.com/manojsakhwar>`_, `@TiagoMRodrigues <https://github.com/TiagoMRodrigues>`_, `@savkov <https://github.com/savkov>`_, `@pspiegelhalter <https://github.com/pspiegelhalter>`_, `@chenb67 <https://github.com/chenb67>`_, `@kylepjohnson <https://github.com/kylepjohnson>`_, `@YanhaoYang <https://github.com/YanhaoYang>`_, `@tjrileywisc <https://github.com/tjrileywisc>`_, `@dechov <https://github.com/dechov>`_, `@wjt <https://github.com/wjt>`_, `@jsmootiv <https://github.com/jsmootiv>`_ and `@blarghmatey <https://github.com/blarghmatey>`_ for the pull requests!
+
+2016-11-04 `v1.2.0 <https://github.com/explosion/spaCy/releases/tag/v1.2.0>`_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
+------------------------------------------------------------------------------------------------------------------------------------------------------
 
 **✨ Major features and improvements**
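A minimal sketch of the new ``Span.sentiment`` attribute announced in the changelog above, mirroring the default-averaging test added later in this diff (the sentiment score assigned here is a made-up example value, and an installed 'en' model is assumed):

    # Span.sentiment defaults to the mean of token.sentiment over the span.
    import spacy
    nlp = spacy.load('en', parser=False, entity=False)
    nlp.vocab[u'good'].sentiment = 3.0      # example lexeme-level score
    doc = nlp.make_doc(u'good stuff')
    assert doc[:2].sentiment == 3.0 / 2     # 'stuff' defaults to 0.0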
@@ -0,0 +1,22 @@
+# Load NER
+from __future__ import unicode_literals
+import spacy
+import pathlib
+from spacy.pipeline import EntityRecognizer
+from spacy.vocab import Vocab
+
+def load_model(model_dir):
+    model_dir = pathlib.Path(model_dir)
+    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
+    with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
+        nlp.vocab.strings.load(file_)
+    nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
+    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
+    return (nlp, ner)
+
+(nlp, ner) = load_model('ner')
+doc = nlp.make_doc('Who is Shaka Khan?')
+nlp.tagger(doc)
+ner(doc)
+for word in doc:
+    print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
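The script prints ``word.ent_iob`` as a raw integer. A small hypothetical helper (the names here are ours, not spaCy's) to make that column readable, assuming spaCy's IOB coding of 0 = unset, 1 = inside, 2 = outside, 3 = begin:

    # Decode token.ent_iob integers into IOB letters for display.
    IOB_CODES = {0: '', 1: 'I', 2: 'O', 3: 'B'}

    def print_entities(doc):
        for word in doc:
            print(word.text, IOB_CODES[word.ent_iob], word.ent_type_)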
@@ -10,6 +10,13 @@ from spacy.tagger import Tagger
 
 
 def train_ner(nlp, train_data, entity_types):
+    # Add new words to vocab.
+    for raw_text, _ in train_data:
+        doc = nlp.make_doc(raw_text)
+        for word in doc:
+            _ = nlp.vocab[word.orth]
+
+    # Train NER.
     ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
     for itn in range(5):
         random.shuffle(train_data)
@@ -20,21 +27,30 @@ def train_ner(nlp, train_data, entity_types):
     ner.model.end_training()
     return ner
 
+def save_model(ner, model_dir):
+    model_dir = pathlib.Path(model_dir)
+    if not model_dir.exists():
+        model_dir.mkdir()
+    assert model_dir.is_dir()
+
+    with (model_dir / 'config.json').open('w') as file_:
+        json.dump(ner.cfg, file_)
+    ner.model.dump(str(model_dir / 'model'))
+    if not (model_dir / 'vocab').exists():
+        (model_dir / 'vocab').mkdir()
+    ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
+    with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
+        ner.vocab.strings.dump(file_)
+
+
 def main(model_dir=None):
     if model_dir is not None:
         model_dir = pathlib.Path(model_dir)
         if not model_dir.exists():
             model_dir.mkdir()
         assert model_dir.is_dir()
 
     nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
 
     # v1.1.2 onwards
     if nlp.tagger is None:
         print('---- WARNING ----')
         print('Data directory not found')
-        print('please run: `python -m spacy.en.download –force all` for better performance')
+        print('please run: `python -m spacy.en.download --force all` for better performance')
         print('Using feature templates for tagging')
         print('-----------------')
         nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
@@ -56,16 +72,17 @@ def main(model_dir=None):
     nlp.tagger(doc)
     ner(doc)
     for word in doc:
-        print(word.text, word.tag_, word.ent_type_, word.ent_iob)
+        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
 
     if model_dir is not None:
-        with (model_dir / 'config.json').open('w') as file_:
-            json.dump(ner.cfg, file_)
-        ner.model.dump(str(model_dir / 'model'))
+        save_model(ner, model_dir)
 
 
 if __name__ == '__main__':
-    main()
+    main('ner')
+    # Who "" 2
+    # is "" 2
+    # Shaka "" PERSON 3
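Taken together with the "Load NER" file added above, these changes give a full save/load round trip. A sketch of the intended flow, using only the functions defined in those two example scripts (``train_data`` follows the (text, offsets) format shown in the docs hunk at the end of this diff):

    # Train, persist to the 'ner' directory, then reload and apply.
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])
    save_model(ner, 'ner')        # writes config.json, model, vocab/
    nlp, ner = load_model('ner')  # from the new "Load NER" example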
@@ -69,7 +69,7 @@ def main(output_dir=None):
         print(word.text, word.tag_, word.pos_)
     if output_dir is not None:
         tagger.model.dump(str(output_dir / 'pos' / 'model'))
-        with (output_dir / 'vocab' / 'strings.json').open('wb') as file_:
+        with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
             tagger.vocab.strings.dump(file_)
@@ -4,7 +4,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 
 __title__ = 'spacy'
-__version__ = '1.2.0'
+__version__ = '1.3.0'
 __summary__ = 'Industrial-strength NLP'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'
@@ -426,3 +426,9 @@ cpdef enum symbol_t:
     #IS_QUOTE
     #IS_LEFT_PUNCT
     #IS_RIGHT_PUNCT
+
+    # These symbols are currently missing. However, if we added them now,
+    # we'd throw off the integer index, and the model would have to be
+    # retrained. We therefore wait until the next data version to add them.
+    # acl
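The constraint in these comments is positional: symbols are consecutive enum values, so a trained model stores plain integer IDs. A toy Python illustration (not spaCy's actual tables) of why inserting a symbol renumbers everything after it, which is why new symbols wait for the next data version:

    # IDs are assigned by position; insertion shifts every later symbol.
    old = ['dep', 'pobj', 'dobj']          # model learned 'pobj' == 1
    new = ['dep', 'acl', 'pobj', 'dobj']   # now ID 1 means 'acl'
    assert old.index('pobj') != new.index('pobj')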
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 from spacy.attrs import HEAD
 from spacy.en import English
 from spacy.tokens.doc import Doc
+import numpy as np
 
 import pytest
@@ -49,3 +50,44 @@ def test_sent(doc):
     assert span.sent.text == 'This is a sentence.'
     span = doc[6:7]
     assert span.sent.root.left_edge.text == 'This'
+
+
+def test_default_sentiment(EN):
+    '''Test new span.sentiment property's default averaging behaviour'''
+    good = EN.vocab[u'good']
+    good.sentiment = 3.0
+    bad = EN.vocab[u'bad']
+    bad.sentiment = -2.0
+
+    doc = Doc(EN.vocab, [u'good', u'stuff', u'bad', u'stuff'])
+
+    good_stuff = doc[:2]
+    assert good_stuff.sentiment == 3.0 / 2
+
+    bad_stuff = doc[-2:]
+    assert bad_stuff.sentiment == -2. / 2
+
+    good_stuff_bad = doc[:-1]
+    assert good_stuff_bad.sentiment == (3. + -2.) / 3.
+
+
+def test_override_sentiment(EN):
+    '''Test that doc.user_span_hooks overrides span.sentiment's default behaviour'''
+    good = EN.vocab[u'good']
+    good.sentiment = 3.0
+    bad = EN.vocab[u'bad']
+    bad.sentiment = -2.0
+
+    doc = Doc(EN.vocab, [u'good', u'stuff', u'bad', u'stuff'])
+
+    doc.user_span_hooks['sentiment'] = lambda span: 10.0
+
+    good_stuff = doc[:2]
+    assert good_stuff.sentiment == 10.0
+
+    bad_stuff = doc[-2:]
+    assert bad_stuff.sentiment == 10.0
+
+    good_stuff_bad = doc[:-1]
+    assert good_stuff_bad.sentiment == 10.0
@@ -179,6 +179,13 @@ cdef class Span:
         self._vector_norm = sqrt(norm) if norm != 0 else 0
         return self._vector_norm
 
+    property sentiment:
+        def __get__(self):
+            if 'sentiment' in self.doc.user_span_hooks:
+                return self.doc.user_span_hooks['sentiment'](self)
+            else:
+                return sum([token.sentiment for token in self]) / len(self)
+
     property text:
         def __get__(self):
             text = self.text_with_ws
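The property first consults ``doc.user_span_hooks``, so user code can replace the default averaging per ``Doc``. A minimal sketch of that override, mirroring the ``test_override_sentiment`` test above:

    # Installing a 'sentiment' hook overrides averaging for every Span of doc.
    doc.user_span_hooks['sentiment'] = lambda span: 10.0
    assert doc[:2].sentiment == 10.0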
@@ -14,7 +14,7 @@ p After reading this page, you should be able to:
 +h(3, "no-job-too-big") No job too big
 
 p
-    | When writing spaCy, one of my motos was #[em no job too big]. I wanted
+    | When writing spaCy, one of my mottos was #[em no job too big]. I wanted
     | to make sure that if Google or Facebook were founded tomorrow, spaCy
     | would be the obvious choice for them. I wanted spaCy to be the obvious
     | choice for web-scale NLP. This meant sweating about performance, because
@@ -217,7 +217,7 @@ p
         ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
     ]
 
-    nlp = spacy.load(entity=False, parser=False)
+    nlp = spacy.load('en', entity=False, parser=False)
     ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
 
     for itn in range(5):
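For context, the loop this snippet opens continues by updating the model on each example. A sketch of how it typically proceeds, based on spaCy's train_ner example script; we assume ``from spacy.gold import GoldParse`` and treat the exact v1.3 signatures as indicative:

    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)  # align offsets to tokens
            ner.update(doc, gold)
    ner.model.end_training()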