From f12b0433085645b6bbd74816240a37aab0c29bfd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 17 Apr 2016 15:19:17 +0200 Subject: [PATCH 1/3] * Add test for Issue #242: Overlapping matches not well recognised. --- spacy/tests/matcher/test_matcher_bugfixes.py | 27 ++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/spacy/tests/matcher/test_matcher_bugfixes.py b/spacy/tests/matcher/test_matcher_bugfixes.py index 275c942c5..a80ec43b5 100644 --- a/spacy/tests/matcher/test_matcher_bugfixes.py +++ b/spacy/tests/matcher/test_matcher_bugfixes.py @@ -1,8 +1,11 @@ import pytest import numpy +import os +import spacy from spacy.matcher import Matcher from spacy.attrs import ORTH, LOWER, ENT_IOB, ENT_TYPE +from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63 from spacy.symbols import DATE @@ -31,6 +34,30 @@ def test_overlap_issue118(EN): assert ents[0].end == 11 +def test_overlap_issue242(): + '''Test bug from multi-word phrases breaking text representation.''' + + patterns = [ + [{LOWER: 'food'}, {LOWER: 'safety'}], + [{LOWER: 'safety'}, {LOWER: 'standards'}], + ] + + if os.environ.get('SPACY_DATA'): + data_dir = os.environ.get('SPACY_DATA') + else: + data_dir = None + + nlp = spacy.en.English(data_dir=data_dir, tagger=False, parser=False, entity=False) + + nlp.matcher.add('FOOD', 'FOOD', {}, patterns) + + doc = nlp(u'There are different food safety standards in different countries.') + + food_safety, safety_standards = doc.ents + assert food_safety.text == u'food safety' + assert safety_standards.text == u'safety standards' + + def test_overlap_reorder(EN): '''Test order dependence''' doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') From 2b419d5b8c9dbf33763dd4b99c219de12306c1d9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 17 Apr 2016 15:34:23 +0200 Subject: [PATCH 2/3] * Update test for Issue #242 --- spacy/tests/matcher/test_matcher_bugfixes.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/spacy/tests/matcher/test_matcher_bugfixes.py b/spacy/tests/matcher/test_matcher_bugfixes.py index a80ec43b5..a125d1a99 100644 --- a/spacy/tests/matcher/test_matcher_bugfixes.py +++ b/spacy/tests/matcher/test_matcher_bugfixes.py @@ -51,11 +51,12 @@ def test_overlap_issue242(): nlp.matcher.add('FOOD', 'FOOD', {}, patterns) - doc = nlp(u'There are different food safety standards in different countries.') - - food_safety, safety_standards = doc.ents - assert food_safety.text == u'food safety' - assert safety_standards.text == u'safety standards' + doc = nlp.tokenizer(u'There are different food safety standards in different countries.') + food_safety, safety_standards = nlp.matcher(doc) + assert food_safety[1] == 3 + assert food_safety[2] == 5 + assert safety_standards[1] == 4 + assert safety_standards[2] == 6 def test_overlap_reorder(EN): From 2add5206aa34f9e393afe51df09483d6a3fcc2d7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 17 Apr 2016 15:40:21 +0200 Subject: [PATCH 3/3] * Fix description of matcher test --- spacy/tests/matcher/test_matcher_bugfixes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/matcher/test_matcher_bugfixes.py b/spacy/tests/matcher/test_matcher_bugfixes.py index a125d1a99..e7b9c75b0 100644 --- a/spacy/tests/matcher/test_matcher_bugfixes.py +++ b/spacy/tests/matcher/test_matcher_bugfixes.py @@ -35,7 +35,7 @@ def test_overlap_issue118(EN): def test_overlap_issue242(): - '''Test bug from multi-word phrases breaking text representation.''' + '''Test overlapping multi-word phrases.''' patterns = [ [{LOWER: 'food'}, {LOWER: 'safety'}],