From 789e1a39805b8a299e880e8c0a091782753c205a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 31 Aug 2017 14:13:00 -0500 Subject: [PATCH 01/17] Use 13 parser features, not 8 --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 552ea4f8f..2aaae4f05 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -705,7 +705,7 @@ cdef class Parser: lower, stream, drop=dropout) return state2vec, upper - nr_feature = 8 + nr_feature = 13 def get_token_ids(self, states): cdef StateClass state From 644d6c9e1a763a421d436bb634da361e73c29ca9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 15:17:44 +0200 Subject: [PATCH 02/17] Improve lemmatization tests, re #1296 --- spacy/tests/lang/en/test_lemmatizer.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/spacy/tests/lang/en/test_lemmatizer.py b/spacy/tests/lang/en/test_lemmatizer.py index d02ae1700..00f02ccb4 100644 --- a/spacy/tests/lang/en/test_lemmatizer.py +++ b/spacy/tests/lang/en/test_lemmatizer.py @@ -2,12 +2,18 @@ from __future__ import unicode_literals import pytest +from ....tokens.doc import Doc @pytest.fixture def en_lemmatizer(EN): return EN.Defaults.create_lemmatizer() +@pytest.mark.models('en') +def test_doc_lemmatization(EN): + doc = Doc(EN.vocab, words=['bleed']) + doc[0].tag_ = 'VBP' + assert doc[0].lemma_ == 'bleed' @pytest.mark.models('en') @pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]), @@ -19,6 +25,16 @@ def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas): assert en_lemmatizer.noun(text) == set(lemmas) +@pytest.mark.models('en') +@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]), + ("feed", ["feed"]), + ("need", ["need"]), + ("ring", ["ring"]), + ("axes", ["axis", "axe", "ax"])]) +def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas): + assert en_lemmatizer.noun(text) == set(lemmas) + + @pytest.mark.xfail @pytest.mark.models('en') def test_en_lemmatizer_base_forms(en_lemmatizer): From b29e6bff46f4afd20f50fc24d4460ef8c260516a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 15:18:10 +0200 Subject: [PATCH 03/17] Improve lemmatization rule for am|VBP --- spacy/lang/en/morph_rules.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py index 4e95dc747..fedb89700 100644 --- a/spacy/lang/en/morph_rules.py +++ b/spacy/lang/en/morph_rules.py @@ -59,7 +59,8 @@ MORPH_RULES = { "VBP": { "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}, - "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"} + "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}, + "am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"}, }, "VBD": { From bfddf50081557b8ad5815a60a3fd8625ee5fe728 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 15:18:41 +0200 Subject: [PATCH 04/17] Fix #1296: Incorrect lemmatization of base form verbs --- spacy/lemmatizer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d7541c56b..4d534b50f 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -44,6 +44,11 @@ class Lemmatizer(object): return True elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf': return True + # This maps 'VBP' to base form -- probably just need 'IS_BASE' + # morphology + elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ + morphology.get('Tense') == 'pres'): + return True elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': return True elif VerbForm_inf in morphology: From 382ce566eb39a730440b891209df33b4d5a213cd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 15:19:01 +0200 Subject: [PATCH 05/17] Fix deserialization bug --- spacy/pipeline.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 238e5670e..cbecbe3b6 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -142,7 +142,7 @@ class BaseThincComponent(object): deserialize = OrderedDict(( ('cfg', lambda b: self.cfg.update(ujson.loads(b))), - ('model', lambda b: self.model.from_bytes(b)), + ('model', load_model), ('vocab', lambda b: self.vocab.from_bytes(b)) )) util.from_bytes(bytes_data, deserialize, exclude) From cb4839033c5bc5373d06aa0a3ed17280b8dd4526 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 15:19:18 +0200 Subject: [PATCH 06/17] Fix loader for EN tests --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index e0e999f4c..f5d65803a 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -13,7 +13,7 @@ from .. import util _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] -_models = {'en': ['en_depent_web_sm', 'en_core_web_md'], +_models = {'en': ['en_core_web_sm'], 'de': ['de_core_news_md'], 'fr': ['fr_depvec_web_lg'], 'xx': ['xx_ent_web_md']} From 9f512e657a425af8b72795dd9d6a771ed1b9617d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 09:26:38 -0500 Subject: [PATCH 07/17] Fix drop_layer calculation --- spacy/_ml.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 8adacdfda..f8a78948e 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -212,12 +212,14 @@ class PrecomputableMaxouts(Model): def drop_layer(layer, factor=2.): def drop_layer_fwd(X, drop=0.): - drop *= factor - mask = layer.ops.get_dropout_mask((1,), drop) - if mask is None or mask > 0: + if drop <= 0.: return layer.begin_update(X, drop=drop) else: - return X, lambda dX, sgd=None: dX + coinflip = layer.ops.xp.random.random() + if (coinflip / factor) >= drop: + return layer.begin_update(X, drop=drop) + else: + return X, lambda dX, sgd=None: dX model = wrap(drop_layer_fwd, layer) model.predict = layer @@ -362,6 +364,8 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): def backward(d_output, sgd=None): return (tokens, d_output) return vectors, backward + + def fine_tune(embedding, combine=None): if combine is not None: raise NotImplementedError( From 7fdafcc4c409697af789debbb287afc45352af00 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 16:38:49 +0200 Subject: [PATCH 08/17] Fix config loading in tagger --- spacy/pipeline.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index cbecbe3b6..7e00a443d 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -417,7 +417,8 @@ class NeuralTagger(BaseThincComponent): def from_bytes(self, bytes_data, **exclude): def load_model(b): if self.model is True: - token_vector_width = util.env_opt('token_vector_width', 128) + token_vector_width = util.env_opt('token_vector_width', + self.cfg.get('token_vector_width', 128)) self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) self.model.from_bytes(b) @@ -451,7 +452,8 @@ class NeuralTagger(BaseThincComponent): def from_disk(self, path, **exclude): def load_model(p): if self.model is True: - token_vector_width = util.env_opt('token_vector_width', 128) + token_vector_width = util.env_opt('token_vector_width', + self.cfg.get('token_vector_width', 128)) self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) self.model.from_bytes(p.open('rb').read()) From d5fbf27335a9df265b8151425e88e311333f176c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 16:45:11 +0200 Subject: [PATCH 09/17] Fix test --- spacy/tests/lang/en/test_tagger.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/lang/en/test_tagger.py b/spacy/tests/lang/en/test_tagger.py index 47a093b99..37d43d0c7 100644 --- a/spacy/tests/lang/en/test_tagger.py +++ b/spacy/tests/lang/en/test_tagger.py @@ -25,7 +25,6 @@ def test_tag_names(EN): doc = EN(text, disable=['parser']) assert type(doc[2].pos) == int assert isinstance(doc[2].pos_, six.text_type) - assert type(doc[2].dep) == int assert isinstance(doc[2].dep_, six.text_type) assert doc[2].tag_ == u'NNS' From 9d65d6798570b9121d099de1b2a3f412fce6563b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 16:46:22 +0200 Subject: [PATCH 10/17] Preserve model compatibility in parser, for now --- spacy/syntax/nn_parser.pyx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 2aaae4f05..34e504da9 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -393,7 +393,8 @@ cdef class Parser: tokvecs = self.model[0].ops.flatten(tokvecses) if USE_FINE_TUNE: - tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) + # TODO: This is incorrect! Unhack when training next model + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) nr_state = len(docs) nr_class = self.moves.n_moves @@ -531,8 +532,8 @@ cdef class Parser: docs = [docs] golds = [golds] if USE_FINE_TUNE: - tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) - tokvecs = self.model[0].ops.flatten(tokvecs) + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) + tokvecs += self.model[0].ops.flatten(my_tokvecs) cuda_stream = get_cuda_stream() @@ -605,8 +606,8 @@ cdef class Parser: assert min(lengths) >= 1 tokvecs = self.model[0].ops.flatten(tokvecs) if USE_FINE_TUNE: - tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) - tokvecs = self.model[0].ops.flatten(tokvecs) + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) + tokvecs += self.model[0].ops.flatten(my_tokvecs) states = self.moves.init_batch(docs) for gold in golds: From 66646ead261ec941fa2624a8fa53295973126433 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 18:14:15 +0200 Subject: [PATCH 11/17] Update travis --- .travis.yml | 3 ++- travis.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index a1f7044b0..fda38937d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ os: env: - VIA=compile LC_ALL=en_US.ascii - VIA=compile + - VIA=pypi # - VIA=sdist @@ -23,7 +24,7 @@ install: script: - "pip install pytest pytest-timeout" - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi - - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi + - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi notifications: diff --git a/travis.sh b/travis.sh index 67d413a1b..49cc3f158 100755 --- a/travis.sh +++ b/travis.sh @@ -3,7 +3,7 @@ if [ "${VIA}" == "pypi" ]; then rm -rf * pip install spacy - python -m spacy.en.download + python -m spacy download en python -m spacy.de.download fi From d47af995613ff503a701a8425b25adb59bda6c4e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 18:43:33 +0200 Subject: [PATCH 12/17] Update travis.yml --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index fda38937d..8be9a35ca 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,6 @@ os: env: - VIA=compile LC_ALL=en_US.ascii - VIA=compile - - VIA=pypi # - VIA=sdist From 3ba9994f1f7c986a6404fd63336514ff7aad5ee9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 18:44:23 +0200 Subject: [PATCH 13/17] Update travis --- .travis.yml | 3 +-- travis.sh | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8be9a35ca..5d15c33d9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,8 +14,7 @@ os: env: - VIA=compile LC_ALL=en_US.ascii - VIA=compile - -# - VIA=sdist + - VIA=pypi install: - "./travis.sh" diff --git a/travis.sh b/travis.sh index 49cc3f158..4ed998ec0 100755 --- a/travis.sh +++ b/travis.sh @@ -4,7 +4,6 @@ if [ "${VIA}" == "pypi" ]; then rm -rf * pip install spacy python -m spacy download en - python -m spacy.de.download fi if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then From d9c609c0f5c5e3585d1815363b40ec206d27fab3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 19:01:38 +0200 Subject: [PATCH 14/17] Update travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 5d15c33d9..95685da29 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ install: script: - "pip install pytest pytest-timeout" - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi - - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi + - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi notifications: From 6bd0a0df9a0d517cc9d7df20385ec8c0bc09048c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 19:49:35 +0200 Subject: [PATCH 15/17] Update travis --- .travis.yml | 4 ++-- travis.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 95685da29..f302d0f1b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ os: env: - VIA=compile LC_ALL=en_US.ascii - VIA=compile - - VIA=pypi + - VIA=pypi_nightly install: - "./travis.sh" @@ -22,7 +22,7 @@ install: script: - "pip install pytest pytest-timeout" - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi - - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi + - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi notifications: diff --git a/travis.sh b/travis.sh index 4ed998ec0..4b7d8017c 100755 --- a/travis.sh +++ b/travis.sh @@ -2,7 +2,7 @@ if [ "${VIA}" == "pypi" ]; then rm -rf * - pip install spacy + pip install spacy-nightly python -m spacy download en fi From 48f4abdcf2a51ce1cb9ec00cc6a866f5b7a6ec9d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 20:05:37 +0200 Subject: [PATCH 16/17] Update travis, removing pypi build --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f302d0f1b..b87ffbd06 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ os: env: - VIA=compile LC_ALL=en_US.ascii - VIA=compile - - VIA=pypi_nightly + #- VIA=pypi_nightly install: - "./travis.sh" From e88a42e4604cc056fd25b324d8b4393c796c5500 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 4 Sep 2017 21:14:39 +0200 Subject: [PATCH 17/17] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 4ff279506..d566fbb1f 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy-nightly' -__version__ = '2.0.0a12' +__version__ = '2.0.0a13' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Explosion AI'