From cb06309ed81262b35def950b9978650c60ddab31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patrick=20D=C3=BCggelin?=
Date: Thu, 12 May 2022 12:23:52 +0200
Subject: [PATCH] Fix PhraseMatcher remove overlapping terms (#10734)

* Add regression test for issue 10643

* Improve overlapping terms testcase

* Fix removing overlapping terms in phrase matcher (#10643)
---
Reviewer notes (free-form text in this position is not part of the commit
message):

* phrasematcher.pyx: ahead of the terminal-hash lookup, the hunk additionally
  pushes `current_node` onto `path_nodes` and `self._terminal_hash` onto
  `path_keys`.
* test_phrase_matcher.py: `test_issue10643` registers the overlapping terms
  "binary" (id "0") and "binary data" (id "1"), then removes them one at a
  time, checking `len(matcher)` and the returned matches after each removal.
* NOTE(review): this copy was recovered from a whitespace-collapsed paste.
  Line breaks, blank lines and leading indentation were reconstructed; the
  indentation of the phrasematcher.pyx hunk is inferred, not read from the
  source - confirm against the upstream commit before applying.
* NOTE(review): the From: header carries no e-mail address in this copy -
  presumably stripped; restore it if the patch is to be applied with
  authorship intact.

 spacy/matcher/phrasematcher.pyx            |  2 ++
 spacy/tests/matcher/test_phrase_matcher.py | 30 ++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 2ff5105ad..382029872 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -118,6 +118,8 @@ cdef class PhraseMatcher:
                     # if token is not found, break out of the loop
                     current_node = NULL
                     break
+            path_nodes.push_back(current_node)
+            path_keys.push_back(self._terminal_hash)
             # remove the tokens from trie node if there are no other
             # keywords with them
             result = map_get(current_node, self._terminal_hash)
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index f893d81f8..3b24f3ba8 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -122,6 +122,36 @@ def test_issue6839(en_vocab):
     assert matches
 
 
+@pytest.mark.issue(10643)
+def test_issue10643(en_vocab):
+    """Ensure overlapping terms can be removed from PhraseMatcher"""
+
+    # fmt: off
+    words = ["Only", "save", "out", "the", "binary", "data", "for", "the", "individual", "components", "."]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
+    terms = {
+        "0": Doc(en_vocab, words=["binary"]),
+        "1": Doc(en_vocab, words=["binary", "data"]),
+    }
+    matcher = PhraseMatcher(en_vocab)
+    for match_id, term in terms.items():
+        matcher.add(match_id, [term])
+
+    matches = matcher(doc)
+    assert matches == [(en_vocab.strings["0"], 4, 5), (en_vocab.strings["1"], 4, 6)]
+
+    matcher.remove("0")
+    assert len(matcher) == 1
+    new_matches = matcher(doc)
+    assert new_matches == [(en_vocab.strings["1"], 4, 6)]
+
+    matcher.remove("1")
+    assert len(matcher) == 0
+    no_matches = matcher(doc)
+    assert not no_matches
+
+
 def test_matcher_phrase_matcher(en_vocab):
     doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
     # intermediate phrase