mirror of https://github.com/explosion/spaCy.git
247 lines
7.8 KiB
Python
247 lines
7.8 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
import gc
|
|
import numpy
|
|
import copy
|
|
from spacy.lang.en import English
|
|
from spacy.lang.en.stop_words import STOP_WORDS
|
|
from spacy.lang.lex_attrs import is_stop
|
|
from spacy.vectors import Vectors
|
|
from spacy.vocab import Vocab
|
|
from spacy.language import Language
|
|
from spacy.tokens import Doc, Span
|
|
from spacy.pipeline import Tagger, EntityRecognizer
|
|
from spacy.attrs import HEAD, DEP
|
|
from spacy.matcher import Matcher
|
|
|
|
from ..util import make_tempdir
|
|
|
|
|
|
def test_issue1506():
|
|
def string_generator():
|
|
for _ in range(10001):
|
|
yield "It's sentence produced by that bug."
|
|
for _ in range(10001):
|
|
yield "I erase some hbdsaj lemmas."
|
|
for _ in range(10001):
|
|
yield "I erase lemmas."
|
|
for _ in range(10001):
|
|
yield "It's sentence produced by that bug."
|
|
for _ in range(10001):
|
|
yield "It's sentence produced by that bug."
|
|
|
|
nlp = English()
|
|
for i, d in enumerate(nlp.pipe(string_generator())):
|
|
# We should run cleanup more than one time to actually cleanup data.
|
|
# In first run — clean up only mark strings as «not hitted».
|
|
if i == 10000 or i == 20000 or i == 30000:
|
|
gc.collect()
|
|
for t in d:
|
|
str(t.lemma_)
|
|
|
|
|
|
def test_issue1518():
|
|
"""Test vectors.resize() works."""
|
|
vectors = Vectors(shape=(10, 10))
|
|
vectors.add('hello', row=2)
|
|
vectors.resize((5, 9))
|
|
|
|
|
|
def test_issue1537():
|
|
"""Test that Span.as_doc() doesn't segfault."""
|
|
string = 'The sky is blue . The man is pink . The dog is purple .'
|
|
doc = Doc(Vocab(), words=string.split())
|
|
doc[0].sent_start = True
|
|
for word in doc[1:]:
|
|
if word.nbor(-1).text == '.':
|
|
word.sent_start = True
|
|
else:
|
|
word.sent_start = False
|
|
sents = list(doc.sents)
|
|
sent0 = sents[0].as_doc()
|
|
sent1 = sents[1].as_doc()
|
|
assert isinstance(sent0, Doc)
|
|
assert isinstance(sent1, Doc)
|
|
|
|
|
|
# TODO: Currently segfaulting, due to l_edge and r_edge misalignment
|
|
#def test_issue1537_model():
|
|
# nlp = load_spacy('en')
|
|
# doc = nlp('The sky is blue. The man is pink. The dog is purple.')
|
|
# sents = [s.as_doc() for s in doc.sents]
|
|
# print(list(sents[0].noun_chunks))
|
|
# print(list(sents[1].noun_chunks))
|
|
|
|
|
|
def test_issue1539():
|
|
"""Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
|
|
v = Vectors(shape=(10, 10), keys=[5,3,98,100])
|
|
v.resize((100,100))
|
|
|
|
|
|
def test_issue1547():
|
|
"""Test that entity labels still match after merging tokens."""
|
|
words = ['\n', 'worda', '.', '\n', 'wordb', '-', 'Biosphere', '2', '-', ' \n']
|
|
doc = Doc(Vocab(), words=words)
|
|
doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings['PRODUCT'])]
|
|
doc[5:7].merge()
|
|
assert [ent.text for ent in doc.ents]
|
|
|
|
|
|
def test_issue1612(en_tokenizer):
|
|
doc = en_tokenizer('The black cat purrs.')
|
|
span = doc[1: 3]
|
|
assert span.orth_ == span.text
|
|
|
|
|
|
def test_issue1654():
|
|
nlp = Language(Vocab())
|
|
assert not nlp.pipeline
|
|
nlp.add_pipe(lambda doc: doc, name='1')
|
|
nlp.add_pipe(lambda doc: doc, name='2', after='1')
|
|
nlp.add_pipe(lambda doc: doc, name='3', after='2')
|
|
assert nlp.pipe_names == ['1', '2', '3']
|
|
nlp2 = Language(Vocab())
|
|
assert not nlp2.pipeline
|
|
nlp2.add_pipe(lambda doc: doc, name='3')
|
|
nlp2.add_pipe(lambda doc: doc, name='2', before='3')
|
|
nlp2.add_pipe(lambda doc: doc, name='1', before='2')
|
|
assert nlp2.pipe_names == ['1', '2', '3']
|
|
|
|
|
|
@pytest.mark.parametrize('text', ['test@example.com', 'john.doe@example.co.uk'])
|
|
def test_issue1698(en_tokenizer, text):
|
|
doc = en_tokenizer(text)
|
|
assert len(doc) == 1
|
|
assert not doc[0].like_url
|
|
|
|
|
|
def test_issue1727():
|
|
"""Test that models with no pretrained vectors can be deserialized
|
|
correctly after vectors are added."""
|
|
data = numpy.ones((3, 300), dtype='f')
|
|
vectors = Vectors(data=data, keys=['I', 'am', 'Matt'])
|
|
tagger = Tagger(Vocab())
|
|
tagger.add_label('PRP')
|
|
tagger.begin_training()
|
|
assert tagger.cfg.get('pretrained_dims', 0) == 0
|
|
tagger.vocab.vectors = vectors
|
|
with make_tempdir() as path:
|
|
tagger.to_disk(path)
|
|
tagger = Tagger(Vocab()).from_disk(path)
|
|
assert tagger.cfg.get('pretrained_dims', 0) == 0
|
|
|
|
|
|
def test_issue1757():
|
|
"""Test comparison against None doesn't cause segfault."""
|
|
doc = Doc(Vocab(), words=['a', 'b', 'c'])
|
|
assert not doc[0] < None
|
|
assert not doc[0] == None
|
|
assert doc[0] >= None
|
|
assert not doc[:2] < None
|
|
assert not doc[:2] == None
|
|
assert doc[:2] >= None
|
|
assert not doc.vocab['a'] == None
|
|
assert not doc.vocab['a'] < None
|
|
|
|
|
|
def test_issue1758(en_tokenizer):
|
|
"""Test that "would've" is handled by the English tokenizer exceptions."""
|
|
tokens = en_tokenizer("would've")
|
|
assert len(tokens) == 2
|
|
assert tokens[0].tag_ == "MD"
|
|
assert tokens[1].lemma_ == "have"
|
|
|
|
|
|
def test_issue1799():
|
|
"""Test sentence boundaries are deserialized correctly, even for
|
|
non-projective sentences."""
|
|
heads_deps = numpy.asarray([[1, 397], [4, 436], [2, 426], [1, 402],
|
|
[0, 8206900633647566924], [18446744073709551615, 440],
|
|
[18446744073709551614, 442]], dtype='uint64')
|
|
doc = Doc(Vocab(), words='Just what I was looking for .'.split())
|
|
doc.vocab.strings.add('ROOT')
|
|
doc = doc.from_array([HEAD, DEP], heads_deps)
|
|
assert len(list(doc.sents)) == 1
|
|
|
|
|
|
def test_issue1807():
|
|
"""Test vocab.set_vector also adds the word to the vocab."""
|
|
vocab = Vocab()
|
|
assert 'hello' not in vocab
|
|
vocab.set_vector('hello', numpy.ones((50,), dtype='f'))
|
|
assert 'hello' in vocab
|
|
|
|
|
|
def test_issue1834():
|
|
"""Test that sentence boundaries & parse/tag flags are not lost
|
|
during serialization."""
|
|
string = "This is a first sentence . And another one"
|
|
doc = Doc(Vocab(), words=string.split())
|
|
doc[6].sent_start = True
|
|
new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
|
|
assert new_doc[6].sent_start
|
|
assert not new_doc.is_parsed
|
|
assert not new_doc.is_tagged
|
|
doc.is_parsed = True
|
|
doc.is_tagged = True
|
|
new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
|
|
assert new_doc.is_parsed
|
|
assert new_doc.is_tagged
|
|
|
|
|
|
def test_issue1868():
|
|
"""Test Vocab.__contains__ works with int keys."""
|
|
vocab = Vocab()
|
|
lex = vocab['hello']
|
|
assert lex.orth in vocab
|
|
assert lex.orth_ in vocab
|
|
assert 'some string' not in vocab
|
|
int_id = vocab.strings.add('some string')
|
|
assert int_id not in vocab
|
|
|
|
|
|
def test_issue1883():
|
|
matcher = Matcher(Vocab())
|
|
matcher.add('pat1', None, [{'orth': 'hello'}])
|
|
doc = Doc(matcher.vocab, words=['hello'])
|
|
assert len(matcher(doc)) == 1
|
|
new_matcher = copy.deepcopy(matcher)
|
|
new_doc = Doc(new_matcher.vocab, words=['hello'])
|
|
assert len(new_matcher(new_doc)) == 1
|
|
|
|
|
|
@pytest.mark.parametrize('word', ['the'])
|
|
def test_issue1889(word):
|
|
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
|
|
|
|
|
|
def test_issue1915():
|
|
cfg = {'hidden_depth': 2} # should error out
|
|
nlp = Language()
|
|
nlp.add_pipe(nlp.create_pipe('ner'))
|
|
nlp.get_pipe('ner').add_label('answer')
|
|
with pytest.raises(ValueError):
|
|
nlp.begin_training(**cfg)
|
|
|
|
|
|
def test_issue1945():
|
|
"""Test regression in Matcher introduced in v2.0.6."""
|
|
matcher = Matcher(Vocab())
|
|
matcher.add('MWE', None, [{'orth': 'a'}, {'orth': 'a'}])
|
|
doc = Doc(matcher.vocab, words=['a', 'a', 'a'])
|
|
matches = matcher(doc) # we should see two overlapping matches here
|
|
assert len(matches) == 2
|
|
assert matches[0][1:] == (0, 2)
|
|
assert matches[1][1:] == (1, 3)
|
|
|
|
|
|
@pytest.mark.parametrize('label', ['U-JOB-NAME'])
|
|
def test_issue1967(label):
|
|
ner = EntityRecognizer(Vocab())
|
|
entry = ([0], ['word'], ['tag'], [0], ['dep'], [label])
|
|
gold_parses = [(None, [(entry, None)])]
|
|
ner.moves.get_actions(gold_parses=gold_parses)
|