From cb06309ed81262b35def950b9978650c60ddab31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patrick=20D=C3=BCggelin?= <patrick.dueggelin@live.de>
Date: Thu, 12 May 2022 12:23:52 +0200
Subject: [PATCH] Fix PhraseMatcher remove overlapping terms (#10734)

* Add regression test for issue 10643

* Improve overlapping terms testcase

* Fix removing overlapping terms in phrase matcher (#10643)
---
 spacy/matcher/phrasematcher.pyx            |  2 ++
 spacy/tests/matcher/test_phrase_matcher.py | 30 ++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 2ff5105ad..382029872 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -118,6 +118,8 @@ cdef class PhraseMatcher:
                     # if token is not found, break out of the loop
                     current_node = NULL
                     break
+            path_nodes.push_back(current_node)
+            path_keys.push_back(self._terminal_hash)
             # remove the tokens from trie node if there are no other
             # keywords with them
             result = map_get(current_node, self._terminal_hash)
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index f893d81f8..3b24f3ba8 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -122,6 +122,36 @@ def test_issue6839(en_vocab):
     assert matches
 
 
+@pytest.mark.issue(10643)
+def test_issue10643(en_vocab):
+    """Overlapping PhraseMatcher terms can be removed independently (#10643)."""
+
+    # fmt: off
+    words = ["Only", "save", "out", "the", "binary", "data", "for", "the", "individual", "components", "."]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
+    terms = {
+        "0": Doc(en_vocab, words=["binary"]),  # single-token term; prefix of "1"
+        "1": Doc(en_vocab, words=["binary", "data"]),  # shares first token with "0"
+    }
+    matcher = PhraseMatcher(en_vocab)
+    for match_id, term in terms.items():
+        matcher.add(match_id, [term])
+
+    matches = matcher(doc)  # baseline: both terms match, starting at token 4
+    assert matches == [(en_vocab.strings["0"], 4, 5), (en_vocab.strings["1"], 4, 6)]
+
+    matcher.remove("0")  # drop the shorter term; the longer one must survive
+    assert len(matcher) == 1
+    new_matches = matcher(doc)
+    assert new_matches == [(en_vocab.strings["1"], 4, 6)]
+
+    matcher.remove("1")  # drop the remaining term; nothing should match now
+    assert len(matcher) == 0
+    no_matches = matcher(doc)
+    assert not no_matches
+
+
 def test_matcher_phrase_matcher(en_vocab):
     doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
     # intermediate phrase