mirror of https://github.com/explosion/spaCy.git

Merge github.com:explosion/spaCy into dutch

commit 4a3fdcce8a
--- a/README.rst
+++ b/README.rst
@@ -6,7 +6,7 @@ Cython. spaCy is built on the very latest research, but it isn't researchware.
 It was designed from day 1 to be used in real products. It's commercial
 open-source software, released under the MIT license.
 
-💫 **Version 1.2 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
+💫 **Version 1.3 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
 
 .. image:: http://i.imgur.com/wFvLZyJ.png
     :target: https://travis-ci.org/explosion/spaCy
@@ -241,8 +241,38 @@ calling ``spacy.load()``, or by passing a ``path`` argument to the ``spacy.en.En
 Changelog
 =========
 
-2016-11-04 `v1.2.0 <https://github.com/explosion/spaCy/releases>`_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
--------------------------------------------------------------------------------------------------------------------------------------------
+2016-12-03 `v1.3.0 <https://github.com/explosion/spaCy/releases>`_: *Improve API consistency*
+---------------------------------------------------------------------------------------------
+
+**✨ API improvements**
+
+* Add ``Span.sentiment`` attribute.
+* `#658 <https://github.com/explosion/spaCy/pull/658>`_: Add ``Span.noun_chunks`` iterator (thanks `@pokey <https://github.com/pokey>`_).
+* `#642 <https://github.com/explosion/spaCy/pull/642>`_: Let ``--data-path`` be specified when running download.py scripts (thanks `@ExplodingCabbage <https://github.com/ExplodingCabbage>`_).
+* `#638 <https://github.com/explosion/spaCy/pull/638>`_: Add German stopwords (thanks `@souravsingh <https://github.com/souravsingh>`_).
+* `#614 <https://github.com/explosion/spaCy/pull/614>`_: Fix ``PhraseMatcher`` to work with new ``Matcher`` (thanks `@sadovnychyi <https://github.com/sadovnychyi>`_).
+
+**🔴 Bug fixes**
+
+* Fix issue `#605 <https://github.com/explosion/spaCy/issues/605>`_: ``accept`` argument to ``Matcher`` now rejects matches as expected.
+* Fix issue `#617 <https://github.com/explosion/spaCy/issues/617>`_: ``Vocab.load()`` now works with string paths, as well as ``Path`` objects.
+* Fix issue `#639 <https://github.com/explosion/spaCy/issues/639>`_: Stop words in ``Language`` class now used as expected.
+* Fix issues `#656 <https://github.com/explosion/spaCy/issues/656>`_, `#624 <https://github.com/explosion/spaCy/issues/624>`_: ``Tokenizer`` special-case rules now support arbitrary token attributes.
+
+**📖 Documentation and examples**
+
+* Add `"Customizing the tokenizer" <https://spacy.io/docs/usage/customizing-tokenizer>`_ workflow.
+* Add `"Training the tagger, parser and entity recognizer" <https://spacy.io/docs/usage/training>`_ workflow.
+* Add `"Entity recognition" <https://spacy.io/docs/usage/entity-recognition>`_ workflow.
+* Fix various typos and inconsistencies.
+
+**👥 Contributors**
+
+Thanks to `@pokey <https://github.com/pokey>`_, `@ExplodingCabbage <https://github.com/ExplodingCabbage>`_, `@souravsingh <https://github.com/souravsingh>`_, `@sadovnychyi <https://github.com/sadovnychyi>`_, `@manojsakhwar <https://github.com/manojsakhwar>`_, `@TiagoMRodrigues <https://github.com/TiagoMRodrigues>`_, `@savkov <https://github.com/savkov>`_, `@pspiegelhalter <https://github.com/pspiegelhalter>`_, `@chenb67 <https://github.com/chenb67>`_, `@kylepjohnson <https://github.com/kylepjohnson>`_, `@YanhaoYang <https://github.com/YanhaoYang>`_, `@tjrileywisc <https://github.com/tjrileywisc>`_, `@dechov <https://github.com/dechov>`_, `@wjt <https://github.com/wjt>`_, `@jsmootiv <https://github.com/jsmootiv>`_ and `@blarghmatey <https://github.com/blarghmatey>`_ for the pull requests!
+
+2016-11-04 `v1.2.0 <https://github.com/explosion/spaCy/releases/tag/v1.2.0>`_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
+------------------------------------------------------------------------------------------------------------------------------------------------------
 
 **✨ Major features and improvements**
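A minimal sketch of the new ``Span.sentiment`` attribute announced in the changelog above, mirroring the default-averaging test added later in this diff (the sentiment score assigned here is a made-up example value, and an installed 'en' model is assumed):

    # Span.sentiment defaults to the mean of token.sentiment over the span.
    import spacy
    nlp = spacy.load('en', parser=False, entity=False)
    nlp.vocab[u'good'].sentiment = 3.0      # example lexeme-level score
    doc = nlp.make_doc(u'good stuff')
    assert doc[:2].sentiment == 3.0 / 2     # 'stuff' defaults to 0.0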
@@ -0,0 +1,22 @@
+# Load NER
+from __future__ import unicode_literals
+import spacy
+import pathlib
+from spacy.pipeline import EntityRecognizer
+from spacy.vocab import Vocab
+
+def load_model(model_dir):
+    model_dir = pathlib.Path(model_dir)
+    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
+    with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
+        nlp.vocab.strings.load(file_)
+    nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
+    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
+    return (nlp, ner)
+
+(nlp, ner) = load_model('ner')
+doc = nlp.make_doc('Who is Shaka Khan?')
+nlp.tagger(doc)
+ner(doc)
+for word in doc:
+    print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
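The script prints ``word.ent_iob`` as a raw integer. A small hypothetical helper (the names here are ours, not spaCy's) to make that column readable, assuming spaCy's IOB coding of 0 = unset, 1 = inside, 2 = outside, 3 = begin:

    # Decode token.ent_iob integers into IOB letters for display.
    IOB_CODES = {0: '', 1: 'I', 2: 'O', 3: 'B'}

    def print_entities(doc):
        for word in doc:
            print(word.text, IOB_CODES[word.ent_iob], word.ent_type_)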
@@ -10,6 +10,13 @@ from spacy.tagger import Tagger
 
 
 def train_ner(nlp, train_data, entity_types):
+    # Add new words to vocab.
+    for raw_text, _ in train_data:
+        doc = nlp.make_doc(raw_text)
+        for word in doc:
+            _ = nlp.vocab[word.orth]
+
+    # Train NER.
     ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
     for itn in range(5):
         random.shuffle(train_data)
@@ -20,21 +27,30 @@ def train_ner(nlp, train_data, entity_types):
     ner.model.end_training()
     return ner
 
+def save_model(ner, model_dir):
+    model_dir = pathlib.Path(model_dir)
+    if not model_dir.exists():
+        model_dir.mkdir()
+    assert model_dir.is_dir()
+
+    with (model_dir / 'config.json').open('w') as file_:
+        json.dump(ner.cfg, file_)
+    ner.model.dump(str(model_dir / 'model'))
+    if not (model_dir / 'vocab').exists():
+        (model_dir / 'vocab').mkdir()
+    ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
+    with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
+        ner.vocab.strings.dump(file_)
+
+
 def main(model_dir=None):
     if model_dir is not None:
         model_dir = pathlib.Path(model_dir)
         if not model_dir.exists():
             model_dir.mkdir()
         assert model_dir.is_dir()
 
     nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
 
     # v1.1.2 onwards
     if nlp.tagger is None:
         print('---- WARNING ----')
         print('Data directory not found')
-        print('please run: `python -m spacy.en.download –force all` for better performance')
+        print('please run: `python -m spacy.en.download --force all` for better performance')
         print('Using feature templates for tagging')
         print('-----------------')
         nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
@@ -56,16 +72,17 @@ def main(model_dir=None):
     nlp.tagger(doc)
     ner(doc)
     for word in doc:
-        print(word.text, word.tag_, word.ent_type_, word.ent_iob)
+        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
 
     if model_dir is not None:
-        with (model_dir / 'config.json').open('w') as file_:
-            json.dump(ner.cfg, file_)
-        ner.model.dump(str(model_dir / 'model'))
+        save_model(ner, model_dir)
 
 
 if __name__ == '__main__':
-    main()
+    main('ner')
+    # Who "" 2
+    # is "" 2
+    # Shaka "" PERSON 3
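Taken together with the "Load NER" file added above, these changes give a full save/load round trip. A sketch of the intended flow, using only the functions defined in those two example scripts (``train_data`` follows the (text, offsets) format shown in the docs hunk at the end of this diff):

    # Train, persist to the 'ner' directory, then reload and apply.
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])
    save_model(ner, 'ner')        # writes config.json, model, vocab/
    nlp, ner = load_model('ner')  # from the new "Load NER" example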
@@ -69,7 +69,7 @@ def main(output_dir=None):
         print(word.text, word.tag_, word.pos_)
     if output_dir is not None:
         tagger.model.dump(str(output_dir / 'pos' / 'model'))
-        with (output_dir / 'vocab' / 'strings.json').open('wb') as file_:
+        with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
             tagger.vocab.strings.dump(file_)
@@ -4,7 +4,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 
 __title__ = 'spacy'
-__version__ = '1.2.0'
+__version__ = '1.3.0'
 __summary__ = 'Industrial-strength NLP'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'
@@ -426,3 +426,9 @@ cpdef enum symbol_t:
     #IS_QUOTE
     #IS_LEFT_PUNCT
     #IS_RIGHT_PUNCT
+
+    # These symbols are currently missing. However, if we added them now,
+    # we'd throw off the integer index, and the model would have to be
+    # retrained. We therefore wait until the next data version to add them.
+    # acl
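The constraint in these comments is positional: symbols are consecutive enum values, so a trained model stores plain integer IDs. A toy Python illustration (not spaCy's actual tables) of why inserting a symbol renumbers everything after it, which is why new symbols wait for the next data version:

    # IDs are assigned by position; insertion shifts every later symbol.
    old = ['dep', 'pobj', 'dobj']          # model learned 'pobj' == 1
    new = ['dep', 'acl', 'pobj', 'dobj']   # now ID 1 means 'acl'
    assert old.index('pobj') != new.index('pobj')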
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 from spacy.attrs import HEAD
 from spacy.en import English
 from spacy.tokens.doc import Doc
+import numpy as np
 
 import pytest
@@ -49,3 +50,44 @@ def test_sent(doc):
     assert span.sent.text == 'This is a sentence.'
     span = doc[6:7]
     assert span.sent.root.left_edge.text == 'This'
+
+
+def test_default_sentiment(EN):
+    '''Test new span.sentiment property's default averaging behaviour'''
+    good = EN.vocab[u'good']
+    good.sentiment = 3.0
+    bad = EN.vocab[u'bad']
+    bad.sentiment = -2.0
+
+    doc = Doc(EN.vocab, [u'good', u'stuff', u'bad', u'stuff'])
+
+    good_stuff = doc[:2]
+    assert good_stuff.sentiment == 3.0 / 2
+
+    bad_stuff = doc[-2:]
+    assert bad_stuff.sentiment == -2. / 2
+
+    good_stuff_bad = doc[:-1]
+    assert good_stuff_bad.sentiment == (3. + -2.) / 3.
+
+
+def test_override_sentiment(EN):
+    '''Test that doc.user_span_hooks overrides span.sentiment's default behaviour'''
+    good = EN.vocab[u'good']
+    good.sentiment = 3.0
+    bad = EN.vocab[u'bad']
+    bad.sentiment = -2.0
+
+    doc = Doc(EN.vocab, [u'good', u'stuff', u'bad', u'stuff'])
+
+    doc.user_span_hooks['sentiment'] = lambda span: 10.0
+
+    good_stuff = doc[:2]
+    assert good_stuff.sentiment == 10.0
+
+    bad_stuff = doc[-2:]
+    assert bad_stuff.sentiment == 10.0
+
+    good_stuff_bad = doc[:-1]
+    assert good_stuff_bad.sentiment == 10.0
@@ -179,6 +179,13 @@ cdef class Span:
         self._vector_norm = sqrt(norm) if norm != 0 else 0
         return self._vector_norm
 
+    property sentiment:
+        def __get__(self):
+            if 'sentiment' in self.doc.user_span_hooks:
+                return self.doc.user_span_hooks['sentiment'](self)
+            else:
+                return sum([token.sentiment for token in self]) / len(self)
+
     property text:
         def __get__(self):
             text = self.text_with_ws
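The property first consults ``doc.user_span_hooks``, so user code can replace the default averaging per ``Doc``. A minimal sketch of that override, mirroring the ``test_override_sentiment`` test above:

    # Installing a 'sentiment' hook overrides averaging for every Span of doc.
    doc.user_span_hooks['sentiment'] = lambda span: 10.0
    assert doc[:2].sentiment == 10.0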
@@ -14,7 +14,7 @@ p After reading this page, you should be able to:
 +h(3, "no-job-too-big") No job too big
 
 p
-    | When writing spaCy, one of my motos was #[em no job too big]. I wanted
+    | When writing spaCy, one of my mottos was #[em no job too big]. I wanted
     | to make sure that if Google or Facebook were founded tomorrow, spaCy
     | would be the obvious choice for them. I wanted spaCy to be the obvious
     | choice for web-scale NLP. This meant sweating about performance, because
@@ -217,7 +217,7 @@ p
         ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
     ]
 
-    nlp = spacy.load(entity=False, parser=False)
+    nlp = spacy.load('en', entity=False, parser=False)
     ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
 
     for itn in range(5):
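For context, the loop this snippet opens continues by updating the model on each example. A sketch of how it typically proceeds, based on spaCy's train_ner example script; we assume ``from spacy.gold import GoldParse`` and treat the exact v1.3 signatures as indicative:

    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)  # align offsets to tokens
            ner.update(doc, gold)
    ner.model.end_training()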