From e40465487c045bb19adf65d965e93b23a35e3b19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ole=20Henrik=20Skogstr=C3=B8m?=
Date: Tue, 30 Jan 2018 15:44:29 +0100
Subject: [PATCH 01/11] Added French syntax iterator with explanation

---
 spacy/lang/nb/__init__.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index 900e59626..b6ec65e1e 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -13,6 +13,12 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
+# Borrowing french syntax parser because both languages use
+# universal dependencies for tagging/parsing.
+# Read here for more:
+# https://github.com/explosion/spaCy/pull/1882#issuecomment-361409573
+from ..fr.syntax_iterators import SYNTAX_ITERATORS
+
 
 class NorwegianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -22,6 +28,7 @@ class NorwegianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     lemma_lookup = LOOKUP
+    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Norwegian(Language):

From f4a7d1a423964876208f0caccf0ba7a19d4832a0 Mon Sep 17 00:00:00 2001
From: Motoki Wu
Date: Tue, 30 Jan 2018 18:29:54 -0800
Subject: [PATCH 02/11] Make sure to pass **cfg to each component when training

---
 spacy/language.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index ae62f918a..a2b945c49 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -461,7 +461,8 @@ class Language(object):
             if hasattr(proc, 'begin_training'):
                 proc.begin_training(get_gold_tuples(),
                                     pipeline=self.pipeline,
-                                    sgd=self._optimizer)
+                                    sgd=self._optimizer,
+                                    **cfg)
         return self._optimizer
 
     def evaluate(self, docs_golds, verbose=False):

From 54062b7326b998c0fe3015ae9d78816762d52c25 Mon Sep 17 00:00:00 2001
From: Motoki Wu
Date: Tue, 30 Jan 2018 18:30:19 -0800
Subject: [PATCH 03/11] Added test for issue #1915

---
 spacy/tests/regression/test_issue1915.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue1915.py

diff --git a/spacy/tests/regression/test_issue1915.py b/spacy/tests/regression/test_issue1915.py
new file mode 100644
index 000000000..23cf6dc73
--- /dev/null
+++ b/spacy/tests/regression/test_issue1915.py
@@ -0,0 +1,18 @@
+# coding: utf8
+
+from __future__ import unicode_literals
+
+import pytest
+
+from ...language import Language
+
+
+def test_simple_ner():
+    # 'hidden_depth' is not a recognised config option for the NER model,
+    # so begin_training should raise rather than silently ignore it.
+    cfg = {'hidden_depth': 2}
+    nlp = Language()
+    nlp.add_pipe(nlp.create_pipe('ner'))
+    nlp.get_pipe('ner').add_label('answer')
+    with pytest.raises(ValueError):
+        nlp.begin_training(**cfg)

From 002ee80ddf1e3616e9d957abbdab76180d45aa27 Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 2 Feb 2018 20:32:08 +0100
Subject: [PATCH 04/11] Add html5lib to setup.py to fix six error (see #1924)

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index a387c605c..7c26a7491 100755
--- a/setup.py
+++ b/setup.py
@@ -192,6 +192,7 @@ def setup_package():
             'thinc>=6.10.1,<6.11.0',
             'plac<1.0.0,>=0.9.6',
             'six',
+            'html5lib==1.0b8',
             'pathlib',
             'ujson>=1.35',
             'dill>=0.2,<0.3',

From 9df9da34a3280664277f326da573993affbe7be6 Mon Sep 17 00:00:00 2001
From: Ali Zarezade
Date: Sat, 3 Feb 2018 17:21:34 +0330
Subject: [PATCH 05/11] Fix init_model issue

Fixes issue #1928. The unparenthesised conditional expression was parsed
as a tuple, so vector_keys was always None; checking len(vectors_data)
also avoids the ambiguous truth value of a non-empty numpy array.
---
 spacy/cli/init_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 6e3369f4d..99a6e87eb 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -36,7 +36,7 @@ def init_model(lang, output_dir, freqs_loc, clusters_loc=None, vectors_loc=None,
     vectors_loc = ensure_path(vectors_loc)
 
     probs, oov_prob = read_freqs(freqs_loc)
-    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else None, None
+    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
     clusters = read_clusters(clusters_loc) if clusters_loc else {}
     nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors)
 
@@ -69,7 +69,7 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
             lex_added += 1
 
     nlp.vocab.cfg.update({'oov_prob': oov_prob})
-    if vectors_data:
+    if len(vectors_data):
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)
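A quick illustration of the bug fixed in PATCH 05: Python's conditional
expression binds tighter than the comma, so the unparenthesised fallback
builds a 2-tuple whose second element is always None. A minimal standalone
sketch (the read_vectors stub below is a hypothetical stand-in for the real
helper, for illustration only):

    def read_vectors(loc):
        # Hypothetical stand-in for the real helper, which returns
        # a (vectors_data, vector_keys) pair.
        return ['0.1 0.2'], ['apple']

    loc = 'vectors.txt'

    # Buggy: parsed as `x = ((read_vectors(loc) if loc else None), None)`,
    # so the whole pair lands in `data` and `keys` is silently None.
    data, keys = read_vectors(loc) if loc else None, None
    assert data == (['0.1 0.2'], ['apple']) and keys is None

    # Fixed, as in the patch: parenthesise the fallback so both branches
    # of the conditional yield a 2-tuple to unpack.
    data, keys = read_vectors(loc) if loc else (None, None)
    assert data == ['0.1 0.2'] and keys == ['apple']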
From 35272eade8b7088ce7159a67ae220891a05de886 Mon Sep 17 00:00:00 2001
From: sayf eddine hammemi
Date: Sun, 4 Feb 2018 20:45:24 +0100
Subject: [PATCH 06/11] Accept contributor agreement

---
 .github/CONTRIBUTOR_AGREEMENT.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md
index f34603065..a8c741ce1 100644
--- a/.github/CONTRIBUTOR_AGREEMENT.md
+++ b/.github/CONTRIBUTOR_AGREEMENT.md
@@ -87,11 +87,11 @@ U.S. Federal law. Any choice of law rules will not apply.
 7. Please place an “x” on one of the applicable statement below. Please do NOT
    mark both statements:
 
-    * [ ] I am signing on behalf of myself as an individual and no other person
+    * [x] I am signing on behalf of myself as an individual and no other person
       or entity, including my employer, has or will have rights with respect to
       my contributions.
 
-    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    * [x] I am signing on behalf of my employer or a legal entity and I have the
       actual authority to contractually bind that entity.
 
 ## Contributor Details

From 86e7727855047ad036f72fa7d30ecee65c5975c3 Mon Sep 17 00:00:00 2001
From: sayf eddine hammemi
Date: Sun, 4 Feb 2018 20:36:32 +0100
Subject: [PATCH 07/11] Fix typo in the word "build"

---
 website/usage/_install/_instructions.jade | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/usage/_install/_instructions.jade b/website/usage/_install/_instructions.jade
index aeab67d2f..677c9d176 100644
--- a/website/usage/_install/_instructions.jade
+++ b/website/usage/_install/_instructions.jade
@@ -185,7 +185,7 @@ p
 
 p
     | Install a version of the
-    | #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Bulild Tools] or
+    | #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Build Tools] or
     | #[+a("https://www.visualstudio.com/vs/visual-studio-express/") Visual Studio Express]
    | that matches the version that was used to compile your Python
    | interpreter. For official distributions these are:

From 251a7805fe1d64f4c7c3890648b46776c5c6a5b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ole=20Henrik=20Skogstr=C3=B8m?=
Date: Mon, 5 Feb 2018 14:45:05 +0100
Subject: [PATCH 08/11] Copied French syntax iterator to simplify future
 changes

---
 spacy/lang/nb/__init__.py         |  2 +-
 spacy/lang/nb/syntax_iterators.py | 42 +++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)
 create mode 100644 spacy/lang/nb/syntax_iterators.py

diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index b6ec65e1e..629f4d6d8 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -17,7 +17,7 @@ from ...util import update_exc, add_lookups
 # universal dependencies for tagging/parsing.
 # Read here for more:
 # https://github.com/explosion/spaCy/pull/1882#issuecomment-361409573
-from ..fr.syntax_iterators import SYNTAX_ITERATORS
+from .syntax_iterators import SYNTAX_ITERATORS
 
 
 class NorwegianDefaults(Language.Defaults):
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
new file mode 100644
index 000000000..c9de4f084
--- /dev/null
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    seen = set()
+    for i, word in enumerate(obj):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            yield word.left_edge.i, word.right_edge.i + 1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                yield word.left_edge.i, word.right_edge.i + 1, np_label
+
+
+SYNTAX_ITERATORS = {
+    'noun_chunks': noun_chunks
+}
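With the iterator registered on NorwegianDefaults, base noun phrases become
available through the standard noun_chunks API on any parsed Doc. A usage
sketch, assuming a pipeline with a Norwegian dependency parser is available
(no pretrained Norwegian model ships with spaCy at this point, so the model
name and sentence here are illustrative only):

    import spacy

    # Hypothetical Norwegian pipeline with a trained dependency parser.
    nlp = spacy.load('nb_model')
    doc = nlp('Jeg liker norsk mat.')

    # The SYNTAX_ITERATORS entry wires noun_chunks up as a generator of
    # (start, end, label) token offsets, exposed as Span objects:
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.root.dep_)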
From 697b60fbab85bf5064c50cb8755ff9d39ab9eb0f Mon Sep 17 00:00:00 2001
From: Ryan Matthews
Date: Mon, 5 Feb 2018 16:17:54 -0500
Subject: [PATCH 09/11] Fix typo in README

The "sm" suffix was missing from the en_core_web_sm.load() call.
---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index a47084254..a50d9bd70 100644
--- a/README.rst
+++ b/README.rst
@@ -218,7 +218,7 @@ then call its ``load()`` method:
     import spacy
     import en_core_web_sm
 
-    nlp = en_core_web_.load()
+    nlp = en_core_web_sm.load()
     doc = nlp(u'This is a sentence.')
 
 📖 **For more info and examples, check out the**

From 58eb178667d9319dddaf9990ea519375cc407dca Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 7 Feb 2018 01:08:30 +0100
Subject: [PATCH 10/11] Update Doc.char_span docs [ci skip]

---
 website/api/doc.jade | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/website/api/doc.jade b/website/api/doc.jade
index fd2fc34ef..7dc5e9842 100644
--- a/website/api/doc.jade
+++ b/website/api/doc.jade
@@ -245,7 +245,9 @@ p Check whether an extension has been registered on the #[code Doc] class.
     +tag method
     +tag-new(2)
 
-p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+p
+    | Create a #[code Span] object from the slice #[code doc.text[start : end]].
+    | Returns #[code None] if the character indices don't map to a valid span.
 
 +aside-code("Example").
     doc = nlp(u'I like New York')
@@ -276,7 +278,7 @@ p Create a #[code Span] object from the slice #[code doc.text[start : end]].
     +row("foot")
         +cell returns
        +cell #[code Span]
-        +cell The newly constructed object.
+        +cell The newly constructed object or #[code None].
 
+h(2, "similarity") Doc.similarity
    +tag method
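The None return value documented above is easy to trip over, since character
offsets that cut through a token don't raise, they simply yield no span. A
small defensive sketch against spaCy 2.x (assuming the English model is
installed):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'I like New York')

    span = doc.char_span(7, 15)     # aligns exactly with 'New York'
    assert span.text == 'New York'

    # These offsets end mid-token ('New Yo'), so no valid span exists and
    # char_span returns None instead of raising. Check before use:
    bad = doc.char_span(7, 13)
    if bad is None:
        print('character indices did not align to token boundaries')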
From f377c483e4caad212187e516d8aa121d8f957b50 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 7 Feb 2018 01:08:42 +0100
Subject: [PATCH 11/11] Add note on manual entity order in displaCy [ci skip]

---
 website/usage/_visualizers/_html.jade | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/website/usage/_visualizers/_html.jade b/website/usage/_visualizers/_html.jade
index 648a6de80..e95b3527a 100644
--- a/website/usage/_visualizers/_html.jade
+++ b/website/usage/_visualizers/_html.jade
@@ -74,7 +74,8 @@ p
     | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet].
     | Simply convert the dependency parse or recognised entities to displaCy's
     | format and set #[code manual=True] on either #[code render()] or
-    | #[code serve()].
+    | #[code serve()]. When setting #[code ents] manually, make sure to supply
+    | them in the right order, i.e. starting with the lowest start position.
 
 +aside-code("Example").
     ex = [{'text': 'But Google is starting from behind.',
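To make the manual format concrete, a minimal rendering sketch continuing the
example sentence above (offsets chosen to match the 'Google' substring; with
several entities, the dicts in `ents` must be ordered by ascending `start`):

    from spacy import displacy

    # Manual format: plain dicts instead of a Doc object.
    ex = [{'text': 'But Google is starting from behind.',
           'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
           'title': None}]
    html = displacy.render(ex, style='ent', manual=True)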