mirror of https://github.com/explosion/spaCy.git
Implement full remove()
Remove unnecessary trie paths and free unused maps. For consistency with Matcher, raise a KeyError when attempting to remove a match ID that has not been added.
This commit is contained in:
parent
7862a6eb01
commit
3fdb22d832
|
@ -84,12 +84,13 @@ cdef class PhraseMatcher:
|
||||||
return (unpickle_matcher, data, None, None)
|
return (unpickle_matcher, data, None, None)
|
||||||
|
|
||||||
def remove(self, key):
|
def remove(self, key):
|
||||||
"""Remove a match-rule from the matcher by match ID.
|
"""Remove a rule from the matcher by match ID. A KeyError is raised if
|
||||||
|
the key does not exist.
|
||||||
|
|
||||||
key (unicode): The match ID.
|
key (unicode): The match ID.
|
||||||
"""
|
"""
|
||||||
if key not in self._keywords:
|
if key not in self._keywords:
|
||||||
return
|
raise KeyError(key)
|
||||||
cdef MapStruct* current_node
|
cdef MapStruct* current_node
|
||||||
cdef MapStruct* terminal_map
|
cdef MapStruct* terminal_map
|
||||||
cdef MapStruct* node_pointer
|
cdef MapStruct* node_pointer
|
||||||
|
@ -97,13 +98,16 @@ cdef class PhraseMatcher:
|
||||||
cdef key_t terminal_key
|
cdef key_t terminal_key
|
||||||
cdef void* value
|
cdef void* value
|
||||||
cdef int c_i = 0
|
cdef int c_i = 0
|
||||||
|
cdef vector[MapStruct*] path_nodes
|
||||||
|
cdef vector[key_t] path_keys
|
||||||
|
cdef key_t key_to_remove
|
||||||
for keyword in self._keywords[key]:
|
for keyword in self._keywords[key]:
|
||||||
current_node = self.c_map
|
current_node = self.c_map
|
||||||
token_trie_list = []
|
|
||||||
for token in keyword:
|
for token in keyword:
|
||||||
result = map_get(current_node, token)
|
result = map_get(current_node, token)
|
||||||
if result:
|
if result:
|
||||||
token_trie_list.append((token, <uintptr_t>current_node))
|
path_nodes.push_back(current_node)
|
||||||
|
path_keys.push_back(token)
|
||||||
current_node = <MapStruct*>result
|
current_node = <MapStruct*>result
|
||||||
else:
|
else:
|
||||||
# if token is not found, break out of the loop
|
# if token is not found, break out of the loop
|
||||||
|
@ -113,27 +117,25 @@ cdef class PhraseMatcher:
|
||||||
# keywords with them
|
# keywords with them
|
||||||
result = map_get(current_node, self._terminal_hash)
|
result = map_get(current_node, self._terminal_hash)
|
||||||
if current_node != NULL and result:
|
if current_node != NULL and result:
|
||||||
# if this is the only remaining key, remove unnecessary paths
|
|
||||||
terminal_map = <MapStruct*>result
|
terminal_map = <MapStruct*>result
|
||||||
terminal_keys = []
|
terminal_keys = []
|
||||||
c_i = 0
|
c_i = 0
|
||||||
while map_iter(terminal_map, &c_i, &terminal_key, &value):
|
while map_iter(terminal_map, &c_i, &terminal_key, &value):
|
||||||
terminal_keys.append(self.vocab.strings[terminal_key])
|
terminal_keys.append(self.vocab.strings[terminal_key])
|
||||||
# TODO: not working, fix remove for unused paths/maps
|
# if this is the only remaining key, remove unnecessary paths
|
||||||
if False and terminal_keys == [key]:
|
if terminal_keys == [key]:
|
||||||
# we found a complete match for input keyword
|
while not path_nodes.empty():
|
||||||
token_trie_list.append((self.vocab.strings[key], <uintptr_t>terminal_map))
|
node_pointer = path_nodes.back()
|
||||||
token_trie_list.reverse()
|
path_nodes.pop_back()
|
||||||
for key_to_remove, py_node_pointer in token_trie_list:
|
key_to_remove = path_keys.back()
|
||||||
node_pointer = <MapStruct*>py_node_pointer
|
path_keys.pop_back()
|
||||||
result = map_get(node_pointer, key_to_remove)
|
result = map_get(node_pointer, key_to_remove)
|
||||||
if node_pointer.filled == 1:
|
if node_pointer.filled == 1:
|
||||||
map_clear(node_pointer, key_to_remove)
|
map_clear(node_pointer, key_to_remove)
|
||||||
self.mem.free(result)
|
self.mem.free(result)
|
||||||
pass
|
|
||||||
else:
|
else:
|
||||||
# more than one key means more than 1 path,
|
# more than one key means more than 1 path,
|
||||||
# delete not required path and keep the other
|
# delete not required path and keep the others
|
||||||
map_clear(node_pointer, key_to_remove)
|
map_clear(node_pointer, key_to_remove)
|
||||||
self.mem.free(result)
|
self.mem.free(result)
|
||||||
break
|
break
|
||||||
|
|
|
@ -84,6 +84,7 @@ def test_phrase_matcher_remove(en_vocab):
|
||||||
assert "TEST2" not in matcher
|
assert "TEST2" not in matcher
|
||||||
assert "TEST3" not in matcher
|
assert "TEST3" not in matcher
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
|
with pytest.raises(KeyError):
|
||||||
matcher.remove("TEST3")
|
matcher.remove("TEST3")
|
||||||
assert "TEST1" not in matcher
|
assert "TEST1" not in matcher
|
||||||
assert "TEST2" not in matcher
|
assert "TEST2" not in matcher
|
||||||
|
|
Loading…
Reference in New Issue