Merge branch 'master' into develop

Ines Montani 2019-09-11 11:52:38 +02:00
commit e82a8d0d7a
14 changed files with 1710 additions and 2132 deletions

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import re
from wasabi import Printer
from ...gold import iob_to_biluo

@@ -46,6 +46,11 @@ class GreekLemmatizer(object):
        )
        return lemmas

+    def lookup(self, string):
+        if string in self.lookup_table:
+            return self.lookup_table[string]
+        return string

def lemmatize(string, index, exceptions, rules):
    string = string.lower()
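The added lookup method simply falls back to the input string when the table has no entry, which is what lets Token.lemma_ stay non-empty even without POS tags (see the test_issue4272 regression test further down). A minimal sketch of that fallback, using a plain dict in place of the lemmatizer's lookup table (table contents here are made up for illustration, this is not the GreekLemmatizer API):

    # Stand-in for GreekLemmatizer.lookup_table; entries are illustrative only
    lookup_table = {"Χθες": "χθες"}

    def lookup(string):
        if string in lookup_table:
            return lookup_table[string]
        return string

    print(lookup("Χθες"))     # known form: lemma comes from the table
    print(lookup("άγνωστη"))  # unknown form: returned unchanged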

@@ -1,8 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

File diff suppressed because it is too large

@@ -15,7 +15,6 @@ _abbrev_exc = [
    {ORTH: "пет", LEMMA: "петак", NORM: "петак"},
    {ORTH: "суб", LEMMA: "субота", NORM: "субота"},
    {ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
    # Months abbreviations
    {ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
    {ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
@@ -28,7 +27,7 @@ _abbrev_exc = [
    {ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
    {ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
    {ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
-    {ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
+    {ORTH: "дец", LEMMA: "децембар", NORM: "децембар"},
]

@@ -103,7 +103,13 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    text = "The players start."
    heads = [1, 1, 0, -1]
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        tags=["DT", "NN", "VBZ", "."],
+        pos=["DET", "NOUN", "VERB", "PUNCT"],
+        heads=heads,
+    )
    assert len(doc) == 4
    assert doc[0].text == "The"
    assert doc[0].tag_ == "DT"
@@ -115,7 +121,13 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    assert doc[0].tag_ == "NN"
    assert doc[0].pos_ == "NOUN"
    assert doc[0].lemma_ == "The players"
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        tags=["DT", "NN", "VBZ", "."],
+        pos=["DET", "NOUN", "VERB", "PUNCT"],
+        heads=heads,
+    )
    assert len(doc) == 4
    assert doc[0].text == "The"
    assert doc[0].tag_ == "DT"
@@ -269,18 +281,15 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # if there is a parse, span.root provides default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
-    heads = [ 0, -1, 1, -3, -4, -5, -1, -7, -8 ]
-    ents = [
-        (3, 5, "ent-de"),
-        (5, 7, "ent-fg"),
-    ]
-    deps = ["dep"] * len(words)
+    heads = [0, -1, 1, -3, -4, -5, -1, -7, -8]
+    ents = [(3, 5, "ent-de"), (5, 7, "ent-fg")]
+    deps = ["dep"] * len(words)
    en_vocab.strings.add("ent-de")
    en_vocab.strings.add("ent-fg")
    en_vocab.strings.add("dep")
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
-    assert doc[2:4].root == doc[3] # root of 'c d' is d
-    assert doc[4:6].root == doc[4] # root is 'e f' is e
+    assert doc[2:4].root == doc[3]  # root of 'c d' is d
+    assert doc[4:6].root == doc[4]  # root is 'e f' is e
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
@@ -295,12 +304,9 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # check that B is preserved if span[start] is B
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
-    heads = [ 0, -1, 1, 1, -4, -5, -1, -7, -8 ]
-    ents = [
-        (3, 5, "ent-de"),
-        (5, 7, "ent-de"),
-    ]
-    deps = ["dep"] * len(words)
+    heads = [0, -1, 1, 1, -4, -5, -1, -7, -8]
+    ents = [(3, 5, "ent-de"), (5, 7, "ent-de")]
+    deps = ["dep"] * len(words)
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[3:5])

@@ -14,24 +14,24 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
def test_issue1061():
-    '''Test special-case works after tokenizing. Was caching problem.'''
-    text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'
+    """Test special-case works after tokenizing. Was caching problem."""
+    text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
    tokenizer = English.Defaults.create_tokenizer()
    doc = tokenizer(text)
-    assert 'MATH' in [w.text for w in doc]
-    assert '_MATH_' not in [w.text for w in doc]
+    assert "MATH" in [w.text for w in doc]
+    assert "_MATH_" not in [w.text for w in doc]
-    tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
+    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
-    assert '_MATH_' in [w.text for w in doc]
-    assert 'MATH' not in [w.text for w in doc]
+    assert "_MATH_" in [w.text for w in doc]
+    assert "MATH" not in [w.text for w in doc]
    # For sanity, check it works when pipeline is clean.
    tokenizer = English.Defaults.create_tokenizer()
-    tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
+    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
-    assert '_MATH_' in [w.text for w in doc]
-    assert 'MATH' not in [w.text for w in doc]
+    assert "_MATH_" in [w.text for w in doc]
+    assert "MATH" not in [w.text for w in doc]

@pytest.mark.xfail(

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.matcher import Matcher
from spacy.tokens import Doc

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.matcher import Matcher
from spacy.tokens import Doc

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.matcher import Matcher
from spacy.tokens import Doc

@@ -2,44 +2,37 @@
from __future__ import unicode_literals
from spacy.lang.en import English
-import spacy
from spacy.tokenizer import Tokenizer
+from spacy import util
-from spacy.tests.util import make_tempdir
+from ..util import make_tempdir

def test_issue4190():
    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
-    result_1a = [token.text for token in doc_1a]
+    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
-        nlp_2 = spacy.load(model_dir)
+        nlp_2 = util.load_model(model_dir)
        # This should be the modified tokenizer
        doc_2 = nlp_2(test_string)
        result_2 = [token.text for token in doc_2]
        assert result_1b == result_2

def customize_tokenizer(nlp):
-    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
-    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
-    infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)
-    # remove all exceptions where a single letter is followed by a period (e.g. 'h.')
+    prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
+    suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
+    infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
+    # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
@@ -53,5 +46,4 @@ def customize_tokenizer(nlp):
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    nlp.tokenizer = new_tokenizer
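For context, customize_tokenizer above rebuilds the tokenizer without the single-letter abbreviation exceptions, so a trailing "c." should be split after the custom rules are applied. A rough usage sketch of that effect (illustrative only; it mirrors the test rather than documenting an official workflow):

    from spacy.lang.en import English

    nlp = English()
    customize_tokenizer(nlp)  # helper from the diff above
    doc = nlp("Test c.")
    # With the single-letter exception removed, the period becomes its own token
    print([t.text for t in doc])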

@@ -0,0 +1,12 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.el import Greek
+
+
+def test_issue4272():
+    """Test that lookup table can be accessed from Token.lemma if no POS tags
+    are available."""
+    nlp = Greek()
+    doc = nlp("Χθες")
+    assert doc[0].lemma_

@@ -56,6 +56,7 @@ def test_lookups_to_from_bytes():
    assert table2.get("b") == 2
    assert new_lookups.to_bytes() == lookups_bytes

# This fails on Python 3.5
@pytest.mark.xfail
def test_lookups_to_from_disk():
@@ -76,6 +77,7 @@ def test_lookups_to_from_disk():
    assert len(table2) == 3
    assert table2.get("b") == 2

# This fails on Python 3.5
@pytest.mark.xfail
def test_lookups_to_from_bytes_via_vocab():
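The lookups tests above exercise serialization round-trips. A rough sketch of the bytes round-trip they cover, assuming the spacy.lookups.Lookups API from this development line (the table name and contents below are made up):

    from spacy.lookups import Lookups

    lookups = Lookups()
    lookups.add_table("demo", {"a": 1, "b": 2})  # hypothetical table name and data
    lookups_bytes = lookups.to_bytes()

    new_lookups = Lookups()
    new_lookups.from_bytes(lookups_bytes)
    table2 = new_lookups.get_table("demo")
    print(table2.get("b"))  # 2, matching the checks in the tests above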