Implement full remove()

Remove unnecessary trie paths and free unused maps.

For consistency with Matcher, raise a KeyError when attempting to remove a
match ID that has not been added.
This commit is contained in:
Adriane Boyd 2019-09-26 11:31:03 +02:00
parent 7862a6eb01
commit 3fdb22d832
2 changed files with 18 additions and 15 deletions

View File

@ -84,12 +84,13 @@ cdef class PhraseMatcher:
return (unpickle_matcher, data, None, None) return (unpickle_matcher, data, None, None)
def remove(self, key): def remove(self, key):
"""Remove a match-rule from the matcher by match ID. """Remove a rule from the matcher by match ID. A KeyError is raised if
the key does not exist.
key (unicode): The match ID. key (unicode): The match ID.
""" """
if key not in self._keywords: if key not in self._keywords:
return raise KeyError(key)
cdef MapStruct* current_node cdef MapStruct* current_node
cdef MapStruct* terminal_map cdef MapStruct* terminal_map
cdef MapStruct* node_pointer cdef MapStruct* node_pointer
@ -97,13 +98,16 @@ cdef class PhraseMatcher:
cdef key_t terminal_key cdef key_t terminal_key
cdef void* value cdef void* value
cdef int c_i = 0 cdef int c_i = 0
cdef vector[MapStruct*] path_nodes
cdef vector[key_t] path_keys
cdef key_t key_to_remove
for keyword in self._keywords[key]: for keyword in self._keywords[key]:
current_node = self.c_map current_node = self.c_map
token_trie_list = []
for token in keyword: for token in keyword:
result = map_get(current_node, token) result = map_get(current_node, token)
if result: if result:
token_trie_list.append((token, <uintptr_t>current_node)) path_nodes.push_back(current_node)
path_keys.push_back(token)
current_node = <MapStruct*>result current_node = <MapStruct*>result
else: else:
# if token is not found, break out of the loop # if token is not found, break out of the loop
@ -113,27 +117,25 @@ cdef class PhraseMatcher:
# keywords with them # keywords with them
result = map_get(current_node, self._terminal_hash) result = map_get(current_node, self._terminal_hash)
if current_node != NULL and result: if current_node != NULL and result:
# if this is the only remaining key, remove unnecessary paths
terminal_map = <MapStruct*>result terminal_map = <MapStruct*>result
terminal_keys = [] terminal_keys = []
c_i = 0 c_i = 0
while map_iter(terminal_map, &c_i, &terminal_key, &value): while map_iter(terminal_map, &c_i, &terminal_key, &value):
terminal_keys.append(self.vocab.strings[terminal_key]) terminal_keys.append(self.vocab.strings[terminal_key])
# TODO: not working, fix remove for unused paths/maps # if this is the only remaining key, remove unnecessary paths
if False and terminal_keys == [key]: if terminal_keys == [key]:
# we found a complete match for input keyword while not path_nodes.empty():
token_trie_list.append((self.vocab.strings[key], <uintptr_t>terminal_map)) node_pointer = path_nodes.back()
token_trie_list.reverse() path_nodes.pop_back()
for key_to_remove, py_node_pointer in token_trie_list: key_to_remove = path_keys.back()
node_pointer = <MapStruct*>py_node_pointer path_keys.pop_back()
result = map_get(node_pointer, key_to_remove) result = map_get(node_pointer, key_to_remove)
if node_pointer.filled == 1: if node_pointer.filled == 1:
map_clear(node_pointer, key_to_remove) map_clear(node_pointer, key_to_remove)
self.mem.free(result) self.mem.free(result)
pass
else: else:
# more than one key means more than 1 path, # more than one key means more than 1 path,
# delete not required path and keep the other # delete not required path and keep the others
map_clear(node_pointer, key_to_remove) map_clear(node_pointer, key_to_remove)
self.mem.free(result) self.mem.free(result)
break break

View File

@ -84,6 +84,7 @@ def test_phrase_matcher_remove(en_vocab):
assert "TEST2" not in matcher assert "TEST2" not in matcher
assert "TEST3" not in matcher assert "TEST3" not in matcher
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
with pytest.raises(KeyError):
matcher.remove("TEST3") matcher.remove("TEST3")
assert "TEST1" not in matcher assert "TEST1" not in matcher
assert "TEST2" not in matcher assert "TEST2" not in matcher