mirror of https://github.com/explosion/spaCy.git
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit c087a14380
@@ -87,11 +87,11 @@ U.S. Federal law. Any choice of law rules will not apply.
 7. Please place an “x” on one of the applicable statement below. Please do NOT
 mark both statements:

-* [ ] I am signing on behalf of myself as an individual and no other person
+* [x] I am signing on behalf of myself as an individual and no other person
 or entity, including my employer, has or will have rights with respect to my
 contributions.

-* [ ] I am signing on behalf of my employer or a legal entity and I have the
+* [x] I am signing on behalf of my employer or a legal entity and I have the
 actual authority to contractually bind that entity.

 ## Contributor Details
@@ -218,7 +218,7 @@ then call its ``load()`` method:
 import spacy
 import en_core_web_sm

-nlp = en_core_web_.load()
+nlp = en_core_web_sm.load()
 doc = nlp(u'This is a sentence.')

 📖 **For more info and examples, check out the**
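The corrected line calls the model package's own load() helper. A minimal sketch of the two equivalent ways to load the model, assuming the en_core_web_sm package has already been installed (e.g. with python -m spacy download en_core_web_sm):

    import spacy
    import en_core_web_sm

    # Load via the package's own load() helper ...
    nlp = en_core_web_sm.load()
    # ... or let spacy.load() resolve the installed package by name.
    nlp = spacy.load('en_core_web_sm')

    doc = nlp(u'This is a sentence.')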
setup.py
@@ -192,6 +192,7 @@ def setup_package():
 'thinc>=6.10.1,<6.11.0',
 'plac<1.0.0,>=0.9.6',
 'six',
 'html5lib==1.0b8',
 'pathlib',
 'ujson>=1.35',
 'dill>=0.2,<0.3',
@@ -36,7 +36,7 @@ def init_model(lang, output_dir, freqs_loc, clusters_loc=None, vectors_loc=None,
     vectors_loc = ensure_path(vectors_loc)

     probs, oov_prob = read_freqs(freqs_loc)
-    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else None, None
+    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
     clusters = read_clusters(clusters_loc) if clusters_loc else {}

     nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors)
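The fix is a precedence issue: the conditional expression binds tighter than the trailing comma, so the unparenthesised version builds a 2-tuple whose second element is always None. A small standalone illustration (read_vectors here is a hypothetical stand-in, not the real CLI helper):

    def read_vectors(loc):
        # Hypothetical stand-in that returns a (data, keys) pair.
        return [[1.0, 2.0]], ['word']

    vectors_loc = 'vectors.txt'  # truthy, so read_vectors() gets called

    # Buggy: parsed as ((read_vectors(...) if vectors_loc else None), None), so the
    # whole (data, keys) pair lands in vectors_data and vector_keys is always None.
    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else None, None
    assert vector_keys is None

    # Fixed: the parentheses make the else branch an explicit (None, None) pair, and
    # the if branch unpacks the pair returned by read_vectors() as intended.
    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
    assert vector_keys == ['word']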
@@ -69,7 +69,7 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
         lex_added += 1
     nlp.vocab.cfg.update({'oov_prob': oov_prob})

-    if vectors_data:
+    if len(vectors_data):
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)
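The len() check matters because vectors_data is typically a numpy array at this point, and evaluating a multi-element array in a boolean context raises an error. A short illustration, assuming numpy is installed:

    import numpy

    vectors_data = numpy.zeros((3, 2), dtype='float32')

    try:
        if vectors_data:  # ValueError: truth value of an array with more than
            pass          # one element is ambiguous
    except ValueError:
        pass

    if len(vectors_data):  # fine: checks the number of vector rows instead
        print('have %d vectors' % len(vectors_data))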
@@ -13,6 +13,12 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

+# Borrowing french syntax parser because both languages use
+# universal dependencies for tagging/parsing.
+# Read here for more:
+# https://github.com/explosion/spaCy/pull/1882#issuecomment-361409573
+from .syntax_iterators import SYNTAX_ITERATORS
+

 class NorwegianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -22,6 +28,7 @@ class NorwegianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     lemma_lookup = LOOKUP
+    syntax_iterators = SYNTAX_ITERATORS


 class Norwegian(Language):
@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    seen = set()
+    for i, word in enumerate(obj):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+            yield word.left_edge.i, word.right_edge.i+1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+                yield word.left_edge.i, word.right_edge.i+1, np_label
+
+
+SYNTAX_ITERATORS = {
+    'noun_chunks': noun_chunks
+}
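Once the iterator is registered on the language defaults (the syntax_iterators = SYNTAX_ITERATORS line above), it is what backs Doc.noun_chunks. A minimal usage sketch, assuming nlp is a loaded pipeline whose dependency parser has run on the text (a trained Norwegian pipeline is an assumption here, not something this diff ships):

    # `nlp` is assumed to be a loaded pipeline with a dependency parser.
    doc = nlp(u'Jeg liker store hunder og katter.')
    for chunk in doc.noun_chunks:
        # Each chunk is a Span covering one base noun phrase.
        print(chunk.text, chunk.root.text, chunk.root.dep_)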
@@ -461,7 +461,8 @@ class Language(object):
             if hasattr(proc, 'begin_training'):
                 proc.begin_training(get_gold_tuples(),
                                     pipeline=self.pipeline,
-                                    sgd=self._optimizer)
+                                    sgd=self._optimizer,
+                                    **cfg)
         return self._optimizer

     def evaluate(self, docs_golds, verbose=False):
@@ -0,0 +1,19 @@
+# coding: utf8
+
+from __future__ import unicode_literals
+from ...language import Language
+
+
+def test_simple_ner():
+    cfg = {
+        'hidden_depth': 2,  # should error out
+    }
+
+    nlp = Language()
+    nlp.add_pipe(nlp.create_pipe('ner'))
+    nlp.get_pipe('ner').add_label('answer')
+    try:
+        nlp.begin_training(**cfg)
+        assert False  # should error out
+    except ValueError:
+        assert True
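The committed test asserts via try/except that an unsupported config key raises ValueError once **cfg is forwarded. An equivalent standalone sketch using the pytest.raises idiom (with an absolute import in place of the test suite's relative one):

    import pytest
    from spacy.language import Language

    def test_simple_ner_rejects_unknown_cfg():
        nlp = Language()
        nlp.add_pipe(nlp.create_pipe('ner'))
        nlp.get_pipe('ner').add_label('answer')
        with pytest.raises(ValueError):
            nlp.begin_training(hidden_depth=2)  # unsupported key should error out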
@@ -245,7 +245,9 @@ p Check whether an extension has been registered on the #[code Doc] class.
 +tag method
 +tag-new(2)

-p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+p
+  | Create a #[code Span] object from the slice #[code doc.text[start : end]].
+  | Returns #[code None] if the character indices don't map to a valid span.

 +aside-code("Example").
     doc = nlp(u'I like New York')
@@ -276,7 +278,7 @@ p Create a #[code Span] object from the slice #[code doc.text[start : end]].
 +row("foot")
     +cell returns
     +cell #[code Span]
-    +cell The newly constructed object.
+    +cell The newly constructed object or #[code None].

 +h(2, "similarity") Doc.similarity
 +tag method
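A short sketch of the behaviour these two documentation changes describe: Doc.char_span() returns a Span when the character offsets line up with token boundaries and None otherwise (assuming nlp is a loaded English pipeline):

    # `nlp` is assumed to be a loaded English pipeline.
    doc = nlp(u'I like New York')
    span = doc.char_span(7, 15)           # aligns with the tokens 'New York'
    assert span.text == 'New York'
    assert doc.char_span(8, 15) is None   # offsets that cut into a token return None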
@@ -185,7 +185,7 @@ p

 p
     | Install a version of the
-    | #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Bulild Tools] or
+    | #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Build Tools] or
     | #[+a("https://www.visualstudio.com/vs/visual-studio-express/") Visual Studio Express]
     | that matches the version that was used to compile your Python
     | interpreter. For official distributions these are:
@@ -74,7 +74,8 @@ p
     | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet].
     | Simply convert the dependency parse or recognised entities to displaCy's
     | format and set #[code manual=True] on either #[code render()] or
-    | #[code serve()].
+    | #[code serve()]. When setting #[code ents] manually, make sure to supply
+    | them in the right order, i.e. starting with the lowest start position.

 +aside-code("Example").
     ex = [{'text': 'But Google is starting from behind.',
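A minimal sketch of the manual-rendering path the added sentence refers to: entities are passed as character-offset dicts and should be supplied sorted by their start position (the offsets and label below are illustrative, not taken from the docs example):

    from spacy import displacy

    ex = {
        'text': 'But Google is starting from behind.',
        'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],  # keep sorted by 'start'
        'title': None,
    }
    html = displacy.render(ex, style='ent', manual=True)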