diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 33b24c129..b13c9acf8 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -102,8 +102,15 @@ cdef class PhraseMatcher:
         cdef vector[MapStruct*] path_nodes
         cdef vector[key_t] path_keys
         cdef key_t key_to_remove
-        for keyword in self._docs[key]:
+        # Handle longer patterns first. NOTE(review): presumably so a pattern
+        # that is a prefix of another is not pruned before the longer one
+        # has been walked; confirm against the rest of this method.
+        for keyword in sorted(self._docs[key], key=len, reverse=True):
             current_node = self.c_map
+            # The path vectors are declared outside the loop; reset them so
+            # each pattern starts with an empty path.
+            path_nodes.clear()
+            path_keys.clear()
             for token in keyword:
                 result = map_get(current_node, token)
                 if result:
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index ad00e2323..2a7532e85 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -226,3 +226,17 @@ def test_phrase_matcher_callback(en_vocab):
     matcher.add("COMPANY", mock, pattern)
     matches = matcher(doc)
     mock.assert_called_once_with(matcher, doc, 0, matches)
+
+
+def test_phrase_matcher_remove_overlapping_patterns(en_vocab):
+    """Removing a key whose patterns are prefixes of each other works."""
+    matcher = PhraseMatcher(en_vocab)
+    pattern1 = Doc(en_vocab, words=["this"])
+    pattern2 = Doc(en_vocab, words=["this", "is"])
+    pattern3 = Doc(en_vocab, words=["this", "is", "a"])
+    pattern4 = Doc(en_vocab, words=["this", "is", "a", "word"])
+    matcher.add("THIS", None, pattern1, pattern2, pattern3, pattern4)
+    matcher.remove("THIS")
+    # Beyond not crashing, the key must no longer be registered.
+    assert "THIS" not in matcher
+    assert len(matcher) == 0