* Add test for Issue #242: Overlapping matches not well recognised.

This commit is contained in:
Matthew Honnibal 2016-04-17 15:19:17 +02:00
parent b98cc3266d
commit f12b043308
1 changed file with 27 additions and 0 deletions

View File

@@ -1,8 +1,11 @@
import pytest import pytest
import numpy import numpy
import os
import spacy
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.attrs import ORTH, LOWER, ENT_IOB, ENT_TYPE from spacy.attrs import ORTH, LOWER, ENT_IOB, ENT_TYPE
from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63
from spacy.symbols import DATE from spacy.symbols import DATE
@@ -31,6 +34,30 @@ def test_overlap_issue118(EN):
assert ents[0].end == 11 assert ents[0].end == 11
def test_overlap_issue242():
    '''Regression test for Issue #242: two matcher patterns that share the
    token 'safety' should both come back as entities with the right text.'''
    # An unset or empty SPACY_DATA both fall back to None.
    data_dir = os.environ.get('SPACY_DATA') or None
    nlp = spacy.en.English(data_dir=data_dir, tagger=False, parser=False,
                           entity=False)
    overlapping = [
        [{LOWER: 'food'}, {LOWER: 'safety'}],
        [{LOWER: 'safety'}, {LOWER: 'standards'}],
    ]
    nlp.matcher.add('FOOD', 'FOOD', {}, overlapping)
    doc = nlp(u'There are different food safety standards in different countries.')
    # Unpacking only succeeds when doc.ents holds exactly two spans.
    first, second = doc.ents
    assert first.text == u'food safety'
    assert second.text == u'safety standards'
def test_overlap_reorder(EN): def test_overlap_reorder(EN):
'''Test order dependence''' '''Test order dependence'''
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')