diff --git a/README.rst b/README.rst
index bac46dec4..671801061 100644
--- a/README.rst
+++ b/README.rst
@@ -6,7 +6,7 @@ Cython. spaCy is built on the very latest research, but it isn't researchware.
 It was designed from day 1 to be used in real products. It's commercial
 open-source software, released under the MIT license.
 
-💫 **Version 1.2 out now!** `Read the release notes here. `_
+💫 **Version 1.3 out now!** `Read the release notes here. `_
 
 .. image:: http://i.imgur.com/wFvLZyJ.png
     :target: https://travis-ci.org/explosion/spaCy
@@ -241,8 +241,38 @@ calling ``spacy.load()``, or by passing a ``path`` argument to the ``spacy.en.En
 
 Changelog
 =========
 
-2016-11-04 `v1.2.0 `_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
---------------------------------------------------------------------------------------------------------------------------------------------
+2016-12-03 `v1.3.0 `_: *Improve API consistency*
+---------------------------------------------------------------------------------------------
+
+**✨ API improvements**
+
+* Add ``Span.sentiment`` attribute.
+* `#658 `_: Add ``Span.noun_chunks`` iterator (thanks `@pokey `_).
+* `#642 `_: Let ``--data-path`` be specified when running download.py scripts (thanks `@ExplodingCabbage `_).
+* `#638 `_: Add German stopwords (thanks `@souravsingh `_).
+* `#614 `_: Fix ``PhraseMatcher`` to work with new ``Matcher`` (thanks `@sadovnychyi `_).
+
+**🔴 Bug fixes**
+
+* Fix issue `#605 `_: ``accept`` argument to ``Matcher`` now rejects matches as expected.
+* Fix issue `#617 `_: ``Vocab.load()`` now works with string paths, as well as ``Path`` objects.
+* Fix issue `#639 `_: Stop words in ``Language`` class now used as expected.
+* Fix issues `#656 `_, `#624 `_: ``Tokenizer`` special-case rules now support arbitrary token attributes.
+
+
+**📖 Documentation and examples**
+
+* Add `"Customizing the tokenizer" `_ workflow.
+* Add `"Training the tagger, parser and entity recognizer" `_ workflow.
+* Add `"Entity recognition" `_ workflow.
+* Fix various typos and inconsistencies.
+
+**👥 Contributors**
+
+Thanks to `@pokey `_, `@ExplodingCabbage `_, `@souravsingh `_, `@sadovnychyi `_, `@manojsakhwar `_, `@TiagoMRodrigues `_, `@savkov `_, `@pspiegelhalter `_, `@chenb67 `_, `@kylepjohnson `_, `@YanhaoYang `_, `@tjrileywisc `_, `@dechov `_, `@wjt `_, `@jsmootiv `_ and `@blarghmatey `_ for the pull requests!
+
+2016-11-04 `v1.2.0 `_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
+------------------------------------------------------------------------------------------------------------------------------------------------------
 
 **✨ Major features and improvements**
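The changelog above adds two small ``Span`` conveniences: the ``Span.sentiment`` attribute and the ``Span.noun_chunks`` iterator. A minimal usage sketch, assuming the English model data is installed — the sentence and span boundaries are purely illustrative, and ``noun_chunks`` needs the parser to have run::

    from __future__ import unicode_literals
    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'London is a very big city in the United Kingdom.')

    span = doc[0:7]
    # Default behaviour: the average of the tokens' .sentiment values.
    print(span.sentiment)

    # Noun chunks can now be iterated over a Span as well as over a Doc.
    for chunk in span.noun_chunks:
        print(chunk.text)
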
diff --git a/examples/training/load_ner.py b/examples/training/load_ner.py
new file mode 100644
index 000000000..bf81cee50
--- /dev/null
+++ b/examples/training/load_ner.py
@@ -0,0 +1,22 @@
+# Load NER
+from __future__ import unicode_literals
+import spacy
+import pathlib
+from spacy.pipeline import EntityRecognizer
+from spacy.vocab import Vocab
+
+def load_model(model_dir):
+    model_dir = pathlib.Path(model_dir)
+    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
+    with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
+        nlp.vocab.strings.load(file_)
+    nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
+    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
+    return (nlp, ner)
+
+(nlp, ner) = load_model('ner')
+doc = nlp.make_doc('Who is Shaka Khan?')
+nlp.tagger(doc)
+ner(doc)
+for word in doc:
+    print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index 8c96dc0a4..220244b93 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -10,6 +10,13 @@ from spacy.tagger import Tagger
 
 
 def train_ner(nlp, train_data, entity_types):
+    # Add new words to vocab.
+    for raw_text, _ in train_data:
+        doc = nlp.make_doc(raw_text)
+        for word in doc:
+            _ = nlp.vocab[word.orth]
+
+    # Train NER.
     ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
     for itn in range(5):
         random.shuffle(train_data)
@@ -20,21 +27,30 @@ def train_ner(nlp, train_data, entity_types):
     ner.model.end_training()
     return ner
 
 
+def save_model(ner, model_dir):
+    model_dir = pathlib.Path(model_dir)
+    if not model_dir.exists():
+        model_dir.mkdir()
+    assert model_dir.is_dir()
+
+    with (model_dir / 'config.json').open('w') as file_:
+        json.dump(ner.cfg, file_)
+    ner.model.dump(str(model_dir / 'model'))
+    if not (model_dir / 'vocab').exists():
+        (model_dir / 'vocab').mkdir()
+    ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
+    with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
+        ner.vocab.strings.dump(file_)
+
 def main(model_dir=None):
-    if model_dir is not None:
-        model_dir = pathlib.Path(model_dir)
-        if not model_dir.exists():
-            model_dir.mkdir()
-        assert model_dir.is_dir()
-
     nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
 
     # v1.1.2 onwards
     if nlp.tagger is None:
         print('---- WARNING ----')
         print('Data directory not found')
-        print('please run: `python -m spacy.en.download –force all` for better performance')
+        print('please run: `python -m spacy.en.download --force all` for better performance')
         print('Using feature templates for tagging')
         print('-----------------')
         nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
@@ -56,16 +72,17 @@ def main(model_dir=None):
         nlp.tagger(doc)
         ner(doc)
         for word in doc:
-            print(word.text, word.tag_, word.ent_type_, word.ent_iob)
+            print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
 
     if model_dir is not None:
-        with (model_dir / 'config.json').open('w') as file_:
-            json.dump(ner.cfg, file_)
-        ner.model.dump(str(model_dir / 'model'))
+        save_model(ner, model_dir)
+
+
+
 
 if __name__ == '__main__':
-    main()
+    main('ner')
 
 # Who "" 2
 # is "" 2
 # Shaka "" PERSON 3
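The vocab pre-population loop added to ``train_ner()`` works because indexing ``nlp.vocab`` with a word that isn't in the vocabulary yet creates the corresponding lexeme, so the training texts are known to the vocab before the ``EntityRecognizer`` is constructed. A minimal sketch of that behaviour, assuming the English data is installed and that the example word is not already in the vocab::

    from __future__ import unicode_literals
    import spacy

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    n_before = len(nlp.vocab)
    lexeme = nlp.vocab[u'Shakaesque']   # hypothetical unseen word; a new lexeme is created
    assert lexeme.orth_ == u'Shakaesque'
    assert len(nlp.vocab) == n_before + 1
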
is "" 2 # Shaka "" PERSON 3 diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index 6d8f66630..d5a519942 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -69,7 +69,7 @@ def main(output_dir=None): print(word.text, word.tag_, word.pos_) if output_dir is not None: tagger.model.dump(str(output_dir / 'pos' / 'model')) - with (output_dir / 'vocab' / 'strings.json').open('wb') as file_: + with (output_dir / 'vocab' / 'strings.json').open('w') as file_: tagger.vocab.strings.dump(file_) diff --git a/spacy/about.py b/spacy/about.py index 3d5909d2c..1336000f4 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.2.0' +__version__ = '1.3.0' __summary__ = 'Industrial-strength NLP' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 942d8aa9c..ca1d1ed79 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -426,3 +426,9 @@ cpdef enum symbol_t: #IS_QUOTE #IS_LEFT_PUNCT #IS_RIGHT_PUNCT + +# These symbols are currently missing. However, if we add them currently, +# we'll throw off the integer index and the model will have to be retrained. +# We therefore wait until the next data version to add them. +# acl + diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index 27a88a61b..aee869c5b 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from spacy.attrs import HEAD from spacy.en import English +from spacy.tokens.doc import Doc import numpy as np import pytest @@ -49,3 +50,44 @@ def test_sent(doc): assert span.sent.text == 'This is a sentence.' span = doc[6:7] assert span.sent.root.left_edge.text == 'This' + + +def test_default_sentiment(EN): + '''Test new span.sentiment property's default averaging behaviour''' + good = EN.vocab[u'good'] + good.sentiment = 3.0 + bad = EN.vocab[u'bad'] + bad.sentiment = -2.0 + + doc = Doc(EN.vocab, [u'good', 'stuff', u'bad', u'stuff']) + + good_stuff = doc[:2] + assert good_stuff.sentiment == 3.0 / 2 + + bad_stuff = doc[-2:] + assert bad_stuff.sentiment == -2. / 2 + + good_stuff_bad = doc[:-1] + assert good_stuff_bad.sentiment == (3.+-2) / 3. 
diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py
index 27a88a61b..aee869c5b 100644
--- a/spacy/tests/spans/test_span.py
+++ b/spacy/tests/spans/test_span.py
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 from spacy.attrs import HEAD
 from spacy.en import English
+from spacy.tokens.doc import Doc
 import numpy as np
 import pytest
 
@@ -49,3 +50,44 @@ def test_sent(doc):
     assert span.sent.text == 'This is a sentence.'
     span = doc[6:7]
     assert span.sent.root.left_edge.text == 'This'
+
+
+def test_default_sentiment(EN):
+    '''Test new span.sentiment property's default averaging behaviour'''
+    good = EN.vocab[u'good']
+    good.sentiment = 3.0
+    bad = EN.vocab[u'bad']
+    bad.sentiment = -2.0
+
+    doc = Doc(EN.vocab, [u'good', 'stuff', u'bad', u'stuff'])
+
+    good_stuff = doc[:2]
+    assert good_stuff.sentiment == 3.0 / 2
+
+    bad_stuff = doc[-2:]
+    assert bad_stuff.sentiment == -2. / 2
+
+    good_stuff_bad = doc[:-1]
+    assert good_stuff_bad.sentiment == (3.+-2) / 3.
+
+
+
+def test_override_sentiment(EN):
+    '''Test span.sentiment property's override via doc.user_span_hooks'''
+    good = EN.vocab[u'good']
+    good.sentiment = 3.0
+    bad = EN.vocab[u'bad']
+    bad.sentiment = -2.0
+
+    doc = Doc(EN.vocab, [u'good', 'stuff', u'bad', u'stuff'])
+
+    doc.user_span_hooks['sentiment'] = lambda span: 10.0
+
+    good_stuff = doc[:2]
+    assert good_stuff.sentiment == 10.0
+
+    bad_stuff = doc[-2:]
+    assert bad_stuff.sentiment == 10.0
+
+    good_stuff_bad = doc[:-1]
+    assert good_stuff_bad.sentiment == 10.0
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index a4f49555a..903ef26d1 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -179,6 +179,13 @@ cdef class Span:
             self._vector_norm = sqrt(norm) if norm != 0 else 0
             return self._vector_norm
 
+    property sentiment:
+        def __get__(self):
+            if 'sentiment' in self.doc.user_span_hooks:
+                return self.doc.user_span_hooks['sentiment'](self)
+            else:
+                return sum([token.sentiment for token in self]) / len(self)
+
     property text:
         def __get__(self):
             text = self.text_with_ws
diff --git a/website/docs/usage/data-model.jade b/website/docs/usage/data-model.jade
index 3275d878f..beac4b60b 100644
--- a/website/docs/usage/data-model.jade
+++ b/website/docs/usage/data-model.jade
@@ -14,7 +14,7 @@ p After reading this page, you should be able to:
 +h(3, "no-job-too-big") No job too big
 
 p
-  | When writing spaCy, one of my motos was #[em no job too big]. I wanted
+  | When writing spaCy, one of my mottos was #[em no job too big]. I wanted
   | to make sure that if Google or Facebook were founded tomorrow, spaCy
   | would be the obvious choice for them. I wanted spaCy to be the obvious
   | choice for web-scale NLP. This meant sweating about performance, because
diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade
index 1b65f3331..ed29142f4 100644
--- a/website/docs/usage/entity-recognition.jade
+++ b/website/docs/usage/entity-recognition.jade
@@ -217,7 +217,7 @@ p
         ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
     ]
 
-    nlp = spacy.load(entity=False, parser=False)
+    nlp = spacy.load('en', entity=False, parser=False)
     ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
 
     for itn in range(5):
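
The ``sentiment`` property added to ``Span`` above checks ``doc.user_span_hooks`` before falling back to averaging the token values, so applications can plug in their own scorer without subclassing. A small sketch of that hook mechanism, assuming the English data is installed — the keyword-counting scorer is made up for illustration::

    from __future__ import unicode_literals
    import spacy

    nlp = spacy.load('en', parser=False, entity=False)
    doc = nlp(u'The plot is good but the ending is bad.')

    # Hypothetical scorer: count "good"/"bad" tokens instead of averaging
    # the per-token sentiment values.
    def keyword_sentiment(span):
        words = [token.lower_ for token in span]
        return float(words.count(u'good') - words.count(u'bad'))

    doc.user_span_hooks['sentiment'] = keyword_sentiment
    print(doc[0:4].sentiment)   # 1.0, computed by the hook rather than the default average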