Fix PhraseMatcher.remove for overlapping patterns (#4437)

This commit is contained in:
adrianeboyd 2019-10-14 12:19:51 +02:00 committed by Ines Montani
parent f8f68bb062
commit 98a961a60e
2 changed files with 13 additions and 1 deletion

View File

@ -102,8 +102,10 @@ cdef class PhraseMatcher:
cdef vector[MapStruct*] path_nodes
cdef vector[key_t] path_keys
cdef key_t key_to_remove
for keyword in self._docs[key]:
for keyword in sorted(self._docs[key], key=lambda x: len(x), reverse=True):
current_node = self.c_map
path_nodes.clear()
path_keys.clear()
for token in keyword:
result = map_get(current_node, token)
if result:

View File

@ -226,3 +226,13 @@ def test_phrase_matcher_callback(en_vocab):
matcher.add("COMPANY", mock, pattern)
matches = matcher(doc)
mock.assert_called_once_with(matcher, doc, 0, matches)
def test_phrase_matcher_remove_overlapping_patterns(en_vocab):
    """Regression test for #4437: remove() on a key with overlapping patterns.

    All four patterns are registered under the single key "THIS", and each
    one is a strict prefix of the next. Removing the key must complete
    without raising and must leave no rule behind in the matcher.
    """
    matcher = PhraseMatcher(en_vocab)
    # Build patterns so that every shorter one is a prefix of the longer ones.
    pattern1 = Doc(en_vocab, words=["this"])
    pattern2 = Doc(en_vocab, words=["this", "is"])
    pattern3 = Doc(en_vocab, words=["this", "is", "a"])
    pattern4 = Doc(en_vocab, words=["this", "is", "a", "word"])
    matcher.add("THIS", None, pattern1, pattern2, pattern3, pattern4)
    matcher.remove("THIS")
    # Pin the post-removal state instead of relying only on "did not raise".
    assert "THIS" not in matcher
    assert len(matcher) == 0