# cython: infer_types=True """Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. """ from copy import copy from cython.operator cimport dereference as deref from cython.operator cimport preincrement as incr from libc.limits cimport INT_MAX from libc.stdlib cimport abs from libcpp cimport bool from libcpp.string cimport string, to_string from libcpp.unordered_set cimport unordered_set from libcpp.vector cimport vector from ...tokens.doc cimport Doc, set_children_from_heads from ...errors import Errors DELIMITER = '||' def ancestors(tokenid, heads): # Returns all words going from the word up the path to the root. The path # to root cannot be longer than the number of words in the sentence. This # function ends after at most len(heads) steps, because it would otherwise # loop indefinitely on cycles. head = tokenid cnt = 0 while heads[head] != head and cnt < len(heads): head = heads[head] cnt += 1 yield head if head is None: break def contains_cycle(heads): # in an acyclic tree, the path from each word following the head relation # upwards always ends at the root node for tokenid in range(len(heads)): seen = set([tokenid]) for ancestor in ancestors(tokenid, heads): if ancestor in seen: return seen seen.add(ancestor) return None def is_nonproj_arc(tokenid, heads): cdef vector[int] c_heads = _heads_to_c(heads) return _is_nonproj_arc(tokenid, c_heads) cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil except *: # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective # if there is a token k, h < k < d such that h is not # an ancestor of k. Same for h -> d, h > d head = heads[tokenid] if head == tokenid: # root arcs cannot be non-projective return False elif head < 0: # unattached tokens cannot be non-projective return False cdef int start, end if head < tokenid: start, end = (head+1, tokenid) else: start, end = (tokenid+1, head) for k in range(start, end): if not _has_head_as_ancestor(k, head, heads): return True return False cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil except *: ancestor = tokenid cdef unordered_set[int] seen_tokens seen_tokens.insert(ancestor) while True: # Reached the head or a disconnected node if heads[ancestor] == head or heads[ancestor] < 0: return True # Reached the root if heads[ancestor] == ancestor: return False ancestor = heads[ancestor] result = seen_tokens.insert(ancestor) # Found cycle if not result.second: raise_domain_error(heads_to_string(heads)) return False cdef string heads_to_string(const vector[int]& heads) nogil: cdef vector[int].const_iterator citer cdef string cycle_str cycle_str.append("Found cycle in dependency graph: [") # FIXME: Rewrite using ostringstream when available in Cython. citer = heads.const_begin() while citer != heads.const_end(): if citer != heads.const_begin(): cycle_str.append(", ") cycle_str.append(to_string(deref(citer))) incr(citer) cycle_str.append("]") return cycle_str def is_nonproj_tree(heads): cdef vector[int] c_heads = _heads_to_c(heads) # a tree is non-projective if at least one arc is non-projective return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads))) def decompose(label): return label.partition(DELIMITER)[::2] def is_decorated(label): return DELIMITER in label def count_decorated_labels(gold_data): freqs = {} for example in gold_data: proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"), example.get_aligned("DEP")) # set the label to ROOT for each root dependent deco_deps = [ 'ROOT' if head == i else deco_deps[i] for i, head in enumerate(proj_heads) ] # count label frequencies for label in deco_deps: if is_decorated(label): freqs[label] = freqs.get(label, 0) + 1 return freqs def projectivize(heads, labels): # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels) # which encode a projective and decorated tree. proj_heads = copy(heads) cdef int new_head cdef vector[int] c_proj_heads = _heads_to_c(proj_heads) cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads) if smallest_np_arc == -1: # this sentence is already projective return proj_heads, copy(labels) while smallest_np_arc != -1: new_head = _lift(smallest_np_arc, proj_heads) c_proj_heads[smallest_np_arc] = new_head smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads) deco_labels = _decorate(heads, proj_heads, labels) return proj_heads, deco_labels cdef vector[int] _heads_to_c(heads): cdef vector[int] c_heads for head in heads: if head is None: c_heads.push_back(-1) else: assert head < len(heads) c_heads.push_back(head) return c_heads cpdef deprojectivize(Doc doc): # Reattach arcs with decorated labels (following HEAD scheme). For each # decorated arc X||Y, search top-down, left-to-right, breadth-first until # hitting a Y then make this the new head. for i in range(doc.length): label = doc.vocab.strings[doc.c[i].dep] if DELIMITER in label: new_label, head_label = label.split(DELIMITER) new_head = _find_new_head(doc[i], head_label) doc.c[i].head = new_head.i - i doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False) set_children_from_heads(doc.c, 0, doc.length) return doc def _decorate(heads, proj_heads, labels): # uses decoration scheme HEAD from Nivre & Nilsson 2005 if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)): raise ValueError(Errors.E082.format(n_heads=len(heads), n_proj_heads=len(proj_heads), n_labels=len(labels))) deco_labels = [] for tokenid, head in enumerate(heads): if head != proj_heads[tokenid]: deco_labels.append(f"{labels[tokenid]}{DELIMITER}{labels[head]}") else: deco_labels.append(labels[tokenid]) return deco_labels def get_smallest_nonproj_arc_slow(heads): cdef vector[int] c_heads = _heads_to_c(heads) return _get_smallest_nonproj_arc(c_heads) cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil except -2: # return the smallest non-proj arc or None # where size is defined as the distance between dep and head # and ties are broken left to right cdef int smallest_size = INT_MAX # -1 means its already projective. cdef int smallest_np_arc = -1 cdef int size cdef int tokenid cdef int head for tokenid in range(heads.size()): head = heads[tokenid] size = abs(tokenid-head) if size < smallest_size and _is_nonproj_arc(tokenid, heads): smallest_size = size smallest_np_arc = tokenid return smallest_np_arc cpdef int _lift(tokenid, heads): # reattaches a word to it's grandfather head = heads[tokenid] ghead = heads[head] cdef int new_head = ghead if head != ghead else tokenid # attach to ghead if head isn't attached to root else attach to root heads[tokenid] = new_head return new_head def _find_new_head(token, headlabel): # search through the tree starting from the head of the given token # returns the id of the first descendant with the given label # if there is none, return the current head (no change) queue = [token.head] while queue: next_queue = [] for qtoken in queue: for child in qtoken.children: if child.is_space: continue if child == token: continue if child.dep_ == headlabel: return child next_queue.append(child) queue = next_queue return token.head