spaCy/spacy/matcher/dependencymatcher.pyx

353 lines
13 KiB
Cython

# cython: infer_types=True
# cython: profile=True
from __future__ import unicode_literals
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from .matcher cimport Matcher
from ..vocab cimport Vocab
from ..tokens.doc cimport Doc
from .matcher import unpickle_matcher
from ..errors import Errors
from libcpp cimport bool
import numpy
DELIMITER = "||"
INDEX_HEAD = 1
INDEX_RELOP = 0
cdef class DependencyMatcher:
"""Match dependency parse tree based on pattern rules."""
cdef Pool mem
cdef readonly Vocab vocab
cdef readonly Matcher token_matcher
cdef public object _patterns
cdef public object _keys_to_token
cdef public object _root
cdef public object _entities
cdef public object _callbacks
cdef public object _nodes
cdef public object _tree
def __init__(self, vocab):
"""Create the DependencyMatcher.
vocab (Vocab): The vocabulary object, which must be shared with the
documents the matcher will operate on.
RETURNS (DependencyMatcher): The newly constructed object.
"""
size = 20
self.token_matcher = Matcher(vocab)
self._keys_to_token = {}
self._patterns = {}
self._root = {}
self._nodes = {}
self._tree = {}
self._entities = {}
self._callbacks = {}
self.vocab = vocab
self.mem = Pool()
def __reduce__(self):
data = (self.vocab, self._patterns,self._tree, self._callbacks)
return (unpickle_matcher, data, None, None)
def __len__(self):
"""Get the number of rules, which are edges, added to the dependency
tree matcher.
RETURNS (int): The number of rules.
"""
return len(self._patterns)
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
key (unicode): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
return self._normalize_key(key) in self._patterns
def validateInput(self, pattern, key):
idx = 0
visitedNodes = {}
for relation in pattern:
if "PATTERN" not in relation or "SPEC" not in relation:
raise ValueError(Errors.E098.format(key=key))
if idx == 0:
if not(
"NODE_NAME" in relation["SPEC"]
and "NBOR_RELOP" not in relation["SPEC"]
and "NBOR_NAME" not in relation["SPEC"]
):
raise ValueError(Errors.E099.format(key=key))
visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
else:
if not(
"NODE_NAME" in relation["SPEC"]
and "NBOR_RELOP" in relation["SPEC"]
and "NBOR_NAME" in relation["SPEC"]
):
raise ValueError(Errors.E100.format(key=key))
if (
relation["SPEC"]["NODE_NAME"] in visitedNodes
or relation["SPEC"]["NBOR_NAME"] not in visitedNodes
):
raise ValueError(Errors.E101.format(key=key))
visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
idx = idx + 1
def add(self, key, patterns, *_patterns, on_match=None):
if patterns is None or hasattr(patterns, "__call__"): # old API
on_match = patterns
patterns = _patterns
for pattern in patterns:
if len(pattern) == 0:
raise ValueError(Errors.E012.format(key=key))
self.validateInput(pattern,key)
key = self._normalize_key(key)
_patterns = []
for pattern in patterns:
token_patterns = []
for i in range(len(pattern)):
token_pattern = [pattern[i]["PATTERN"]]
token_patterns.append(token_pattern)
# self.patterns.append(token_patterns)
_patterns.append(token_patterns)
self._patterns.setdefault(key, [])
self._callbacks[key] = on_match
self._patterns[key].extend(_patterns)
# Add each node pattern of all the input patterns individually to the
# matcher. This enables only a single instance of Matcher to be used.
# Multiple adds are required to track each node pattern.
_keys_to_token_list = []
for i in range(len(_patterns)):
_keys_to_token = {}
# TODO: Better ways to hash edges in pattern?
for j in range(len(_patterns[i])):
k = self._normalize_key(unicode(key) + DELIMITER + unicode(i) + DELIMITER + unicode(j))
self.token_matcher.add(k, None, _patterns[i][j])
_keys_to_token[k] = j
_keys_to_token_list.append(_keys_to_token)
self._keys_to_token.setdefault(key, [])
self._keys_to_token[key].extend(_keys_to_token_list)
_nodes_list = []
for pattern in patterns:
nodes = {}
for i in range(len(pattern)):
nodes[pattern[i]["SPEC"]["NODE_NAME"]] = i
_nodes_list.append(nodes)
self._nodes.setdefault(key, [])
self._nodes[key].extend(_nodes_list)
# Create an object tree to traverse later on. This data structure
# enables easy tree pattern match. Doc-Token based tree cannot be
# reused since it is memory-heavy and tightly coupled with the Doc.
self.retrieve_tree(patterns, _nodes_list,key)
def retrieve_tree(self, patterns, _nodes_list, key):
_heads_list = []
_root_list = []
for i in range(len(patterns)):
heads = {}
root = -1
for j in range(len(patterns[i])):
token_pattern = patterns[i][j]
if ("NBOR_RELOP" not in token_pattern["SPEC"]):
heads[j] = ('root', j)
root = j
else:
heads[j] = (
token_pattern["SPEC"]["NBOR_RELOP"],
_nodes_list[i][token_pattern["SPEC"]["NBOR_NAME"]]
)
_heads_list.append(heads)
_root_list.append(root)
_tree_list = []
for i in range(len(patterns)):
tree = {}
for j in range(len(patterns[i])):
if(_heads_list[i][j][INDEX_HEAD] == j):
continue
head = _heads_list[i][j][INDEX_HEAD]
if(head not in tree):
tree[head] = []
tree[head].append((_heads_list[i][j][INDEX_RELOP], j))
_tree_list.append(tree)
self._tree.setdefault(key, [])
self._tree[key].extend(_tree_list)
self._root.setdefault(key, [])
self._root[key].extend(_root_list)
def has_key(self, key):
"""Check whether the matcher has a rule with a given key.
key (string or int): The key to check.
RETURNS (bool): Whether the matcher has the rule.
"""
key = self._normalize_key(key)
return key in self._patterns
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
key (unicode or int): The key to retrieve.
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
"""
key = self._normalize_key(key)
if key not in self._patterns:
return default
return (self._callbacks[key], self._patterns[key])
def __call__(self, Doc doc):
matched_key_trees = []
matches = self.token_matcher(doc)
for key in list(self._patterns.keys()):
_patterns_list = self._patterns[key]
_keys_to_token_list = self._keys_to_token[key]
_root_list = self._root[key]
_tree_list = self._tree[key]
_nodes_list = self._nodes[key]
length = len(_patterns_list)
for i in range(length):
_keys_to_token = _keys_to_token_list[i]
_root = _root_list[i]
_tree = _tree_list[i]
_nodes = _nodes_list[i]
id_to_position = {}
for i in range(len(_nodes)):
id_to_position[i]=[]
# TODO: This could be taken outside to improve running time..?
for match_id, start, end in matches:
if match_id in _keys_to_token:
id_to_position[_keys_to_token[match_id]].append(start)
_node_operator_map = self.get_node_operator_map(
doc,
_tree,
id_to_position,
_nodes,_root
)
length = len(_nodes)
matched_trees = []
self.recurse(_tree,id_to_position,_node_operator_map,0,[],matched_trees)
matched_key_trees.append((key,matched_trees))
for i, (ent_id, nodes) in enumerate(matched_key_trees):
on_match = self._callbacks.get(ent_id)
if on_match is not None:
on_match(self, doc, i, matched_key_trees)
return matched_key_trees
def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
cdef bool isValid;
if(patternLength == len(id_to_position.keys())):
isValid = True
for node in range(patternLength):
if(node in tree):
for idx, (relop,nbor) in enumerate(tree[node]):
computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop])
isNbor = False
for computed_nbor in computed_nbors:
if(computed_nbor.i == visitedNodes[nbor]):
isNbor = True
isValid = isValid & isNbor
if(isValid):
matched_trees.append(visitedNodes)
return
allPatternNodes = numpy.asarray(id_to_position[patternLength])
for patternNode in allPatternNodes:
self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visitedNodes+[patternNode],matched_trees)
# Given a node and an edge operator, to return the list of nodes
# from the doc that belong to node+operator. This is used to store
# all the results beforehand to prevent unnecessary computation while
# pattern matching
# _node_operator_map[node][operator] = [...]
def get_node_operator_map(self,doc,tree,id_to_position,nodes,root):
_node_operator_map = {}
all_node_indices = nodes.values()
all_operators = []
for node in all_node_indices:
if node in tree:
for child in tree[node]:
all_operators.append(child[INDEX_RELOP])
all_operators = list(set(all_operators))
all_nodes = []
for node in all_node_indices:
all_nodes = all_nodes + id_to_position[node]
all_nodes = list(set(all_nodes))
for node in all_nodes:
_node_operator_map[node] = {}
for operator in all_operators:
_node_operator_map[node][operator] = []
# Used to invoke methods for each operator
switcher = {
"<": self.dep,
">": self.gov,
"<<": self.dep_chain,
">>": self.gov_chain,
".": self.imm_precede,
"$+": self.imm_right_sib,
"$-": self.imm_left_sib,
"$++": self.right_sib,
"$--": self.left_sib
}
for operator in all_operators:
for node in all_nodes:
_node_operator_map[node][operator] = switcher.get(operator)(doc,node)
return _node_operator_map
def dep(self, doc, node):
return [doc[node].head]
def gov(self,doc,node):
return list(doc[node].children)
def dep_chain(self, doc, node):
return list(doc[node].ancestors)
def gov_chain(self, doc, node):
return list(doc[node].subtree)
def imm_precede(self, doc, node):
if node > 0:
return [doc[node - 1]]
return []
def imm_right_sib(self, doc, node):
for child in list(doc[node].head.children):
if child.i == node - 1:
return [doc[child.i]]
return []
def imm_left_sib(self, doc, node):
for child in list(doc[node].head.children):
if child.i == node + 1:
return [doc[child.i]]
return []
def right_sib(self, doc, node):
candidate_children = []
for child in list(doc[node].head.children):
if child.i < node:
candidate_children.append(doc[child.i])
return candidate_children
def left_sib(self, doc, node):
candidate_children = []
for child in list(doc[node].head.children):
if child.i > node:
candidate_children.append(doc[child.i])
return candidate_children
def _normalize_key(self, key):
if isinstance(key, basestring):
return self.vocab.strings.add(key)
else:
return key