mirror of https://github.com/explosion/spaCy.git
478 lines
18 KiB
Python
478 lines
18 KiB
Python
import pytest
|
||
from spacy.language import Language
|
||
from spacy.vocab import Vocab
|
||
from spacy.pipeline import EntityRuler, DependencyParser
|
||
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||
from spacy import displacy, load
|
||
from spacy.displacy import parse_deps
|
||
from spacy.tokens import Doc, Token
|
||
from spacy.matcher import Matcher, PhraseMatcher
|
||
from spacy.errors import MatchPatternError
|
||
from spacy.util import minibatch
|
||
from spacy.gold import Example
|
||
from spacy.lang.hi import Hindi
|
||
from spacy.lang.es import Spanish
|
||
from spacy.lang.en import English
|
||
from spacy.attrs import IS_ALPHA
|
||
from spacy import registry
|
||
from thinc.api import compounding
|
||
import spacy
|
||
import srsly
|
||
import numpy
|
||
|
||
from ..util import make_tempdir, get_doc
|
||
|
||
|
||
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    """Check that the contracted forms of "not" and "would" are stop words.

    Each parametrized word is a contraction, written with either a plain or
    a typographic apostrophe.
    """
    tokens = en_tokenizer(word)
    # The abbreviated 'not' / 'would' part is taken from the second token.
    contraction = tokens[1]
    assert contraction.is_stop
|
||
|
||
|
||
def test_issue_3526_1(en_vocab):
    """Check that an EntityRuler survives a to_bytes/from_bytes round trip.

    The restored ruler must report the same number of patterns and labels,
    the same overwrite flag and the same entity-ID separator as the original.
    """
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    original = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    serialized = original.to_bytes()

    # Five patterns, but "HELLO" is used twice, hence four distinct labels.
    assert len(original) == len(patterns)
    assert len(original.labels) == 4
    assert original.overwrite

    restored = EntityRuler(nlp).from_bytes(serialized)
    assert len(restored) == len(original)
    assert len(restored.labels) == 4
    assert restored.overwrite == original.overwrite
    assert restored.ent_id_sep == original.ent_id_sep
|
||
|
||
|
||
def test_issue_3526_2(en_vocab):
    """Check that from_bytes accepts a msgpack dump of the bare pattern list.

    Only the patterns are serialized here, so every pattern must come back,
    while the overwrite flag of the fresh ruler is expected to differ from
    the original one (which was created with overwrite_ents=True).
    """
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    original = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    # Old-style payload: just the patterns, without any ruler settings.
    legacy_payload = srsly.msgpack_dumps(original.patterns)

    restored = EntityRuler(nlp).from_bytes(legacy_payload)
    assert len(restored) == len(original)
    assert all(entry in restored.patterns for entry in original.patterns)
    assert restored.overwrite is not original.overwrite
|
||
|
||
|
||
def test_issue_3526_3(en_vocab):
    """Check that from_disk can load patterns written as a plain .jsonl file.

    The patterns are written next to the requested path with a ".jsonl"
    suffix; the ruler is then loaded from the suffix-less path. All patterns
    must be restored, while the overwrite flag is expected to differ from the
    original ruler (which was created with overwrite_ents=True).
    """
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    original = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        ruler_path = tmpdir / "entity_ruler"
        jsonl_path = ruler_path.with_suffix(".jsonl")
        srsly.write_jsonl(jsonl_path, original.patterns)
        restored = EntityRuler(nlp).from_disk(ruler_path)
        assert all(entry in restored.patterns for entry in original.patterns)
        assert len(restored) == len(original)
        assert restored.overwrite is not original.overwrite
|
||
|
||
|
||
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue_3526_4(en_vocab):
    """Check that an entity ruler in a pipeline keeps its patterns and its
    overwrite setting when the pipeline is saved to disk and loaded again.
    """
    expected_patterns = [{"label": "ORG", "pattern": "Apple"}]
    nlp = Language(vocab=en_vocab)
    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        # State of the component in the pipeline that was just saved.
        saved_ruler = nlp.get_pipe("entity_ruler")
        assert saved_ruler.patterns == expected_patterns
        assert saved_ruler.overwrite is True
        # State of the component after loading the pipeline back.
        reloaded_nlp = load(tmpdir)
        loaded_ruler = reloaded_nlp.get_pipe("entity_ruler")
        assert loaded_ruler.patterns == expected_patterns
        assert loaded_ruler.overwrite is True
|
||
|
||
|
||
def test_issue3531():
    """Test that displaCy renderer doesn't require "settings" key.

    Both manual inputs below (dependency and entity style) omit "settings";
    rendering must still produce non-empty output.
    """
    tagged_words = [
        ("But", "CCONJ"),
        ("Google", "PROPN"),
        ("is", "VERB"),
        ("starting", "VERB"),
        ("from", "ADP"),
        ("behind.", "ADV"),
    ]
    arc_specs = [
        (0, 3, "cc", "left"),
        (1, 3, "nsubj", "left"),
        (2, 3, "aux", "left"),
        (3, 4, "prep", "right"),
        (4, 5, "pcomp", "right"),
    ]
    example_dep = {
        "words": [{"text": text, "tag": tag} for text, tag in tagged_words],
        "arcs": [
            {"start": start, "end": end, "label": label, "dir": direction}
            for start, end, label, direction in arc_specs
        ],
    }
    example_ent = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    dep_html = displacy.render(example_dep, style="dep", manual=True)
    assert dep_html
    ent_html = displacy.render(example_ent, style="ent", manual=True)
    assert ent_html
|
||
|
||
|
||
def test_issue3540(en_vocab):
    """Check texts, lemmas and vectors of a Doc before and after splitting
    the token "NewYork" into "New" and "York" with the retokenizer.

    The vectors of the tokens that were not split must be unchanged.
    """
    words = ["I", "live", "in", "NewYork", "right", "now"]
    rows = [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]]
    doc = Doc(en_vocab, words=words)
    doc.tensor = numpy.asarray(rows, dtype="f")

    # Before the split, text and lemma both equal the input words.
    assert [token.text for token in doc] == words
    assert [token.lemma_ for token in doc] == words
    vectors_before = [token.vector for token in doc]
    assert len(vectors_before) == len(doc)

    with doc.retokenize() as retokenizer:
        new_heads = [(doc[3], 1), doc[2]]
        new_attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
        retokenizer.split(doc[3], ["New", "York"], heads=new_heads, attrs=new_attrs)

    words_after = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.text for token in doc] == words_after
    assert [token.lemma_ for token in doc] == words_after
    vectors_after = [token.vector for token in doc]
    assert len(vectors_after) == len(doc)

    # (index before split, index after split) for the untouched tokens;
    # everything after the split token is shifted by one.
    for old_i, new_i in [(0, 0), (1, 1), (2, 2), (4, 5), (5, 6)]:
        assert vectors_before[old_i].tolist() == vectors_after[new_i].tolist()
|
||
|
||
|
||
def test_issue3549(en_vocab):
    """Test that match pattern validation doesn't raise on empty errors.

    A well-formed pattern must be accepted by a validating Matcher, while a
    pattern with an unknown key must raise MatchPatternError.
    """
    matcher = Matcher(en_vocab, validate=True)
    valid_pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
    invalid_pattern = [{"X": "Y"}]
    matcher.add("GOOD", [valid_pattern])
    with pytest.raises(MatchPatternError):
        matcher.add("BAD", [invalid_pattern])
|
||
|
||
|
||
@pytest.mark.skip("Matching currently only works on strings and integers")
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher.

    The test passes as long as running the matcher does not raise.
    """
    Token.set_extension("issue3555", default=None)
    extension_pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [extension_pattern])
    matcher(Doc(en_vocab, words=["have", "apple"]))
|
||
|
||
|
||
def test_issue3611():
    """Test whether adding n-grams in the textcat works even when n is larger
    than the token length of some docs.

    The third training text ("inoff") is a single token while the model uses
    ngram_size=2. The test passes if training runs without raising.
    """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")

    # One Example per text, with a boolean entry for every class.
    train_data = [
        Example.from_dict(
            nlp.make_doc(text),
            {"cats": {label: label == gold for label in unique_classes}},
        )
        for text, gold in zip(x_train, y_train)
    ]

    # Text categorizer with a bag-of-words model over bigrams.
    model = {
        "@architectures": "spacy.TextCatBOW.v1",
        "exclusive_classes": True,
        "ngram_size": 2,
        "no_output_layer": False,
    }
    textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
    for label in unique_classes:
        textcat.add_label(label)

    # Train for three passes with only the textcat component enabled.
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for _ in range(3):
            losses = {}
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(train_data, size=batch_sizes):
                nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
|
||
|
||
|
||
def test_issue3625():
    """Test that default punctuation rules applies to hindi unicode characters"""
    tokenized = Hindi()("hi. how हुए. होटल, होटल")
    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
    token_texts = [token.text for token in tokenized]
    assert token_texts == expected
|
||
|
||
|
||
def test_issue3803():
    """Test that spanish num-like tokens have True for like_num attribute.

    The text mixes digits with their spelled-out Spanish equivalents.
    """
    doc = Spanish()("2 dos 1000 mil 12 doce")
    like_num_flags = [token.like_num for token in doc]
    # Six tokens are expected, every one of them number-like.
    assert like_num_flags == [True] * 6
|
||
|
||
|
||
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_no_subtok():
    """Test that the parser doesn't have subtok label if not learn_tokens.

    The label must be absent both before and after begin_training.
    """
    resolved = registry.make_from_config(
        {"model": DEFAULT_PARSER_MODEL}, validate=True
    )
    parser = DependencyParser(
        Vocab(),
        resolved["model"],
        learn_tokens=False,
        min_action_freq=30,
        update_with_oracle_cut_size=100,
    )
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    # Training is started with an empty set of examples.
    parser.begin_training(lambda: [])
    assert "subtok" not in parser.labels
|
||
|
||
|
||
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_with_subtok():
    """Test that the parser does have subtok label if learn_tokens=True.

    The label is absent right after construction and must be present once
    begin_training has been called.
    """
    resolved = registry.make_from_config(
        {"model": DEFAULT_PARSER_MODEL}, validate=True
    )
    parser = DependencyParser(
        Vocab(),
        resolved["model"],
        learn_tokens=True,
        min_action_freq=30,
        update_with_oracle_cut_size=100,
    )
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    # Training is started with an empty set of examples.
    parser.begin_training(lambda: [])
    assert "subtok" in parser.labels
|
||
|
||
|
||
def test_issue3839(en_vocab):
    """Test that the match ID returned by the matcher equals the ID that the
    vocab's string store holds for the rule name.

    Checked for two patterns, with one and with two optional wildcard tokens
    between the fixed tokens.
    """
    doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
    match_id = "PATTERN"
    one_optional = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
    two_optional = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
    for pattern in (one_optional, two_optional):
        # A fresh matcher per pattern, so that each rule is tested in isolation.
        matcher = Matcher(en_vocab)
        matcher.add(match_id, [pattern])
        first_match = matcher(doc)[0]
        assert first_match[0] == en_vocab.strings[match_id]
|
||
|
||
|
||
@pytest.mark.parametrize(
    "sentence",
    [
        "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
        "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
        "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
    ],
)
def test_issue3869(sentence):
    """Test that the Doc's count_by function works consistently.

    The number of alphabetic tokens counted by hand must equal the count
    that count_by reports for IS_ALPHA == 1.
    """
    doc = English()(sentence)
    manual_count = sum(token.is_alpha for token in doc)
    counted = doc.count_by(IS_ALPHA)
    # The key 1 is missing if there is no alphabetic token at all.
    assert manual_count == counted.get(1, 0)
|
||
|
||
|
||
def test_issue3879(en_vocab):
    """Check that a pattern with two leading optional tokens yields exactly
    two matches on "This is a test ." (no false-positive 'is a test').
    """
    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    assert len(doc) == 5
    matcher = Matcher(en_vocab)
    optional_prefix = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
    matcher.add("TEST", [optional_prefix])
    matches = matcher(doc)
    # Regression: used to fail because of a FP match 'is a test'.
    assert len(matches) == 2
|
||
|
||
|
||
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
    """Test that `nlp.pipe()` works when an empty string ends the batch.

    Fixed in v7.0.5 of Thinc. The test passes if the whole stream can be
    consumed without raising.
    """
    nlp = English()
    # Each trainable component gets a single label.
    for component, label in [("parser", "dep"), ("ner", "PERSON"), ("tagger", "NN")]:
        nlp.add_pipe(component).add_label(label)
    nlp.begin_training()
    texts = ["hello", "world", "", ""]
    for _ in nlp.pipe(texts):
        pass
|
||
|
||
|
||
def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.

    The test passes if parse_deps runs without raising.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.is_parsed = True
    # NOTE(review): presumably a set is stored because it cannot be
    # serialized, so any attempt to do so would raise; confirm.
    doc.user_data["test"] = set()
    parse_deps(doc)
|
||
|
||
|
||
def test_issue3951(en_vocab):
    """Test that combinations of optional rules are matched correctly.

    "Hello my new world" has two tokens between "hello" and "world", neither
    of which is "this", so the pattern below must not match.
    """
    optional_combo = [
        {"LOWER": "hello"},
        {"LOWER": "this", "OP": "?"},
        {"OP": "?"},
        {"LOWER": "world"},
    ]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [optional_combo])
    matches = matcher(Doc(en_vocab, words=["Hello", "my", "new", "world"]))
    assert len(matches) == 0
|
||
|
||
|
||
def test_issue3959():
    """Ensure that a modified pos attribute is serialized correctly.

    The coarse POS tag of the first token is set by hand, the Doc is written
    to disk and loaded into another Doc, where the tag must still be present.
    """
    nlp = English()
    text = "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    doc = nlp(text)
    first = doc[0]
    assert first.pos_ == ""
    first.pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    with make_tempdir() as tmp_dir:
        doc_path = tmp_dir / "my_doc"
        doc.to_disk(doc_path)
        # Load the saved data into a Doc that starts out empty.
        reloaded = nlp("")
        reloaded.from_disk(doc_path)
        assert reloaded[0].pos_ == "NOUN"
|
||
|
||
|
||
def test_issue3962(en_vocab):
    """Ensure that as_doc does not result in out-of-bound access of tokens.

    This is achieved by setting the head to itself if it would lie out of the
    span otherwise. Two spans of the same sentence are converted and the head
    and dependency label of every resulting token are checked.
    """
    # fmt: off
    words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
    deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
    # fmt: on
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)

    first_doc = doc[1:5].as_doc()  # "jests at scars ,"
    assert first_doc.to_json()
    # (head text, dep label) per token of the new Doc
    expected_first = [
        ("jests", "dep"),  # head set to itself, being the new artificial root
        ("jests", "prep"),
        ("at", "pobj"),
        ("jests", "dep"),  # head set to the new artificial root
    ]
    assert [(t.head.text, t.dep_) for t in first_doc] == expected_first
    # We should still have 1 sentence
    assert len(list(first_doc.sents)) == 1

    second_doc = doc[6:9].as_doc()  # "never felt a"
    assert second_doc.to_json()
    expected_second = [
        ("felt", "neg"),
        ("felt", "ROOT"),
        ("felt", "dep"),  # head set to ancestor
    ]
    assert [(t.head.text, t.dep_) for t in second_doc] == expected_second
    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(second_doc.sents)) == 1
|
||
|
||
|
||
def test_issue3962_long(en_vocab):
    """Ensure that as_doc does not result in out-of-bound access of tokens.

    This is achieved by setting the head to itself if it would lie out of the
    span otherwise. The span used here crosses a sentence boundary, so the
    resulting Doc must keep two sentences, and every one of its six tokens is
    checked for its head and dependency label.
    """
    # fmt: off
    words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
    deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
    # fmt: on
    two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json
    # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].head.text == "jests"
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"
    # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].head.text == "They"
    assert doc2[4].dep_ == "dep"
    # head set to the new artificial head (in sentence 2): "never" originally
    # attached to "felt", which lies outside the span. This pair used to
    # repeat the doc2[4] checks, leaving the last token unverified.
    assert doc2[5].head.text == "They"
    assert doc2[5].dep_ == "dep"
    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"
|
||
|
||
|
||
def test_issue3972(en_vocab):
    """Test that the PhraseMatcher returns duplicates for duplicate match IDs.

    The same phrase is registered under two different keys; matching a text
    containing it must yield one match per key.
    """
    matcher = PhraseMatcher(en_vocab)
    for key in ("A", "B"):
        matcher.add(key, [Doc(en_vocab, words=["New", "York"])])
    text = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
    matches = matcher(text)

    assert len(matches) == 2

    # We should have a match for each of the two rules
    matched_keys = [en_vocab.strings[match_id] for match_id, _, _ in matches]
    assert "A" in matched_keys
    assert "B" in matched_keys
|