Merge branch 'master' into develop

Ines Montani 2019-09-11 11:52:38 +02:00
commit e82a8d0d7a
14 changed files with 1710 additions and 2132 deletions

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import re
from wasabi import Printer
from ...gold import iob_to_biluo

@@ -46,6 +46,11 @@ class GreekLemmatizer(object):
        )
        return lemmas

+    def lookup(self, string):
+        if string in self.lookup_table:
+            return self.lookup_table[string]
+        return string

def lemmatize(string, index, exceptions, rules):
    string = string.lower()
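The added lookup method simply falls back to the input string when the table has no entry, which is what lets Token.lemma_ stay non-empty even without POS tags (see the test_issue4272 regression test further down). A minimal sketch of that fallback, using a plain dict in place of the lemmatizer's lookup table (table contents here are made up for illustration, this is not the GreekLemmatizer API):

    # Stand-in for GreekLemmatizer.lookup_table; entries are illustrative only
    lookup_table = {"Χθες": "χθες"}

    def lookup(string):
        if string in lookup_table:
            return lookup_table[string]
        return string

    print(lookup("Χθες"))     # known form: lemma comes from the table
    print(lookup("άγνωστη"))  # unknown form: returned unchanged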

@@ -1,8 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

File diff suppressed because it is too large

@@ -15,7 +15,6 @@ _abbrev_exc = [
    {ORTH: "пет", LEMMA: "петак", NORM: "петак"},
    {ORTH: "суб", LEMMA: "субота", NORM: "субота"},
    {ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
    # Months abbreviations
    {ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
    {ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
@@ -28,7 +27,7 @@ _abbrev_exc = [
    {ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
    {ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
    {ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
-    {ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
+    {ORTH: "дец", LEMMA: "децембар", NORM: "децембар"},
]

@@ -103,7 +103,13 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    text = "The players start."
    heads = [1, 1, 0, -1]
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        tags=["DT", "NN", "VBZ", "."],
+        pos=["DET", "NOUN", "VERB", "PUNCT"],
+        heads=heads,
+    )
    assert len(doc) == 4
    assert doc[0].text == "The"
    assert doc[0].tag_ == "DT"
@@ -115,7 +121,13 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    assert doc[0].tag_ == "NN"
    assert doc[0].pos_ == "NOUN"
    assert doc[0].lemma_ == "The players"
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        tags=["DT", "NN", "VBZ", "."],
+        pos=["DET", "NOUN", "VERB", "PUNCT"],
+        heads=heads,
+    )
    assert len(doc) == 4
    assert doc[0].text == "The"
    assert doc[0].tag_ == "DT"
@@ -269,18 +281,15 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # if there is a parse, span.root provides default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
-    heads = [ 0, -1, 1, -3, -4, -5, -1, -7, -8 ]
-    ents = [
-        (3, 5, "ent-de"),
-        (5, 7, "ent-fg"),
-    ]
-    deps = ["dep"] * len(words)
+    heads = [0, -1, 1, -3, -4, -5, -1, -7, -8]
+    ents = [(3, 5, "ent-de"), (5, 7, "ent-fg")]
+    deps = ["dep"] * len(words)
    en_vocab.strings.add("ent-de")
    en_vocab.strings.add("ent-fg")
    en_vocab.strings.add("dep")
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
-    assert doc[2:4].root == doc[3] # root of 'c d' is d
-    assert doc[4:6].root == doc[4] # root is 'e f' is e
+    assert doc[2:4].root == doc[3]  # root of 'c d' is d
+    assert doc[4:6].root == doc[4]  # root is 'e f' is e
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
@@ -295,12 +304,9 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    # check that B is preserved if span[start] is B
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
-    heads = [ 0, -1, 1, 1, -4, -5, -1, -7, -8 ]
-    ents = [
-        (3, 5, "ent-de"),
-        (5, 7, "ent-de"),
-    ]
-    deps = ["dep"] * len(words)
+    heads = [0, -1, 1, 1, -4, -5, -1, -7, -8]
+    ents = [(3, 5, "ent-de"), (5, 7, "ent-de")]
+    deps = ["dep"] * len(words)
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[3:5])

@@ -14,24 +14,24 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
def test_issue1061():
-    '''Test special-case works after tokenizing. Was caching problem.'''
-    text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'
+    """Test special-case works after tokenizing. Was caching problem."""
+    text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
    tokenizer = English.Defaults.create_tokenizer()
    doc = tokenizer(text)
-    assert 'MATH' in [w.text for w in doc]
-    assert '_MATH_' not in [w.text for w in doc]
+    assert "MATH" in [w.text for w in doc]
+    assert "_MATH_" not in [w.text for w in doc]
-    tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
+    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
-    assert '_MATH_' in [w.text for w in doc]
-    assert 'MATH' not in [w.text for w in doc]
+    assert "_MATH_" in [w.text for w in doc]
+    assert "MATH" not in [w.text for w in doc]
    # For sanity, check it works when pipeline is clean.
    tokenizer = English.Defaults.create_tokenizer()
-    tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
+    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
-    assert '_MATH_' in [w.text for w in doc]
-    assert 'MATH' not in [w.text for w in doc]
+    assert "_MATH_" in [w.text for w in doc]
+    assert "MATH" not in [w.text for w in doc]

@pytest.mark.xfail(

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.matcher import Matcher
from spacy.tokens import Doc

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.matcher import Matcher
from spacy.tokens import Doc

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.matcher import Matcher
from spacy.tokens import Doc

@@ -2,44 +2,37 @@
from __future__ import unicode_literals
from spacy.lang.en import English
-import spacy
from spacy.tokenizer import Tokenizer
+from spacy import util
-from spacy.tests.util import make_tempdir
+from ..util import make_tempdir

def test_issue4190():
    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
-    result_1a = [token.text for token in doc_1a]
+    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
-        nlp_2 = spacy.load(model_dir)
+        nlp_2 = util.load_model(model_dir)
        # This should be the modified tokenizer
        doc_2 = nlp_2(test_string)
        result_2 = [token.text for token in doc_2]
        assert result_1b == result_2

def customize_tokenizer(nlp):
-    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
-    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
-    infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)
-    # remove all exceptions where a single letter is followed by a period (e.g. 'h.')
+    prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
+    suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
+    infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
+    # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
@@ -53,5 +46,4 @@ def customize_tokenizer(nlp):
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    nlp.tokenizer = new_tokenizer
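For context, customize_tokenizer above rebuilds the tokenizer without the single-letter abbreviation exceptions, so a trailing "c." should be split after the custom rules are applied. A rough usage sketch of that effect (illustrative only; it mirrors the test rather than documenting an official workflow):

    from spacy.lang.en import English

    nlp = English()
    customize_tokenizer(nlp)  # helper from the diff above
    doc = nlp("Test c.")
    # With the single-letter exception removed, the period becomes its own token
    print([t.text for t in doc])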

@@ -0,0 +1,12 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.el import Greek
+
+
+def test_issue4272():
+    """Test that lookup table can be accessed from Token.lemma if no POS tags
+    are available."""
+    nlp = Greek()
+    doc = nlp("Χθες")
+    assert doc[0].lemma_

@@ -56,6 +56,7 @@ def test_lookups_to_from_bytes():
    assert table2.get("b") == 2
    assert new_lookups.to_bytes() == lookups_bytes

# This fails on Python 3.5
@pytest.mark.xfail
def test_lookups_to_from_disk():
@@ -76,6 +77,7 @@ def test_lookups_to_from_disk():
    assert len(table2) == 3
    assert table2.get("b") == 2

# This fails on Python 3.5
@pytest.mark.xfail
def test_lookups_to_from_bytes_via_vocab():
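The lookups tests above exercise serialization round-trips. A rough sketch of the bytes round-trip they cover, assuming the spacy.lookups.Lookups API from this development line (the table name and contents below are made up):

    from spacy.lookups import Lookups

    lookups = Lookups()
    lookups.add_table("demo", {"a": 1, "b": 2})  # hypothetical table name and data
    lookups_bytes = lookups.to_bytes()

    new_lookups = Lookups()
    new_lookups.from_bytes(lookups_bytes)
    table2 = new_lookups.get_table("demo")
    print(table2.get("b"))  # 2, matching the checks in the tests above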