From 78a8bec4d0a0e607acd3f9a2c6eaafe54c7ca4ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 21 Feb 2022 15:02:21 +0100 Subject: [PATCH] Make core projectivization functions cdef nogil (#10241) * Make core projectivization methods cdef nogil While profiling the parser, I noticed that relatively a lot of time is spent in projectivization. This change rewrites the functions in the core loops as cdef nogil for efficiency. In C++-land, we use vector in place of Python lists and absent heads are represented as -1 in place of None. * _heads_to_c: add assertion Validation should be performed by the caller, but this assertion ensures that we are not reading/writing out of bounds with incorrect input. --- spacy/pipeline/_parser_internals/nonproj.pyx | 83 +++++++++++++++----- spacy/tests/parser/test_nonproj.py | 4 +- 2 files changed, 66 insertions(+), 21 deletions(-) diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx index 82070cd27..36163fcc3 100644 --- a/spacy/pipeline/_parser_internals/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -4,6 +4,10 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. """ from copy import copy +from libc.limits cimport INT_MAX +from libc.stdlib cimport abs +from libcpp cimport bool +from libcpp.vector cimport vector from ...tokens.doc cimport Doc, set_children_from_heads @@ -41,13 +45,18 @@ def contains_cycle(heads): def is_nonproj_arc(tokenid, heads): + cdef vector[int] c_heads = _heads_to_c(heads) + return _is_nonproj_arc(tokenid, c_heads) + + +cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil: # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective # if there is a token k, h < k < d such that h is not # an ancestor of k. Same for h -> d, h > d head = heads[tokenid] if head == tokenid: # root arcs cannot be non-projective return False - elif head is None: # unattached tokens cannot be non-projective + elif head < 0: # unattached tokens cannot be non-projective return False cdef int start, end @@ -56,19 +65,29 @@ def is_nonproj_arc(tokenid, heads): else: start, end = (tokenid+1, head) for k in range(start, end): - for ancestor in ancestors(k, heads): - if ancestor is None: # for unattached tokens/subtrees - break - elif ancestor == head: # normal case: k dominated by h - break + if _has_head_as_ancestor(k, head, heads): + continue else: # head not in ancestors: d -> h is non-projective return True return False +cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil: + ancestor = tokenid + cnt = 0 + while cnt < heads.size(): + if heads[ancestor] == head or heads[ancestor] < 0: + return True + ancestor = heads[ancestor] + cnt += 1 + + return False + + def is_nonproj_tree(heads): + cdef vector[int] c_heads = _heads_to_c(heads) # a tree is non-projective if at least one arc is non-projective - return any(is_nonproj_arc(word, heads) for word in range(len(heads))) + return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads))) def decompose(label): @@ -98,16 +117,31 @@ def projectivize(heads, labels): # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels) # which encode a projective and decorated tree. proj_heads = copy(heads) - smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) - if smallest_np_arc is None: # this sentence is already projective + + cdef int new_head + cdef vector[int] c_proj_heads = _heads_to_c(proj_heads) + cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads) + if smallest_np_arc == -1: # this sentence is already projective return proj_heads, copy(labels) - while smallest_np_arc is not None: - _lift(smallest_np_arc, proj_heads) - smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) + while smallest_np_arc != -1: + new_head = _lift(smallest_np_arc, proj_heads) + c_proj_heads[smallest_np_arc] = new_head + smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads) deco_labels = _decorate(heads, proj_heads, labels) return proj_heads, deco_labels +cdef vector[int] _heads_to_c(heads): + cdef vector[int] c_heads; + for head in heads: + if head == None: + c_heads.push_back(-1) + else: + assert head < len(heads) + c_heads.push_back(head) + return c_heads + + cpdef deprojectivize(Doc doc): # Reattach arcs with decorated labels (following HEAD scheme). For each # decorated arc X||Y, search top-down, left-to-right, breadth-first until @@ -137,27 +171,38 @@ def _decorate(heads, proj_heads, labels): deco_labels.append(labels[tokenid]) return deco_labels +def get_smallest_nonproj_arc_slow(heads): + cdef vector[int] c_heads = _heads_to_c(heads) + return _get_smallest_nonproj_arc(c_heads) -def _get_smallest_nonproj_arc(heads): + +cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil: # return the smallest non-proj arc or None # where size is defined as the distance between dep and head # and ties are broken left to right - smallest_size = float('inf') - smallest_np_arc = None - for tokenid, head in enumerate(heads): + cdef int smallest_size = INT_MAX + cdef int smallest_np_arc = -1 + cdef int size + cdef int tokenid + cdef int head + + for tokenid in range(heads.size()): + head = heads[tokenid] size = abs(tokenid-head) - if size < smallest_size and is_nonproj_arc(tokenid, heads): + if size < smallest_size and _is_nonproj_arc(tokenid, heads): smallest_size = size smallest_np_arc = tokenid return smallest_np_arc -def _lift(tokenid, heads): +cpdef int _lift(tokenid, heads): # reattaches a word to it's grandfather head = heads[tokenid] ghead = heads[head] + cdef int new_head = ghead if head != ghead else tokenid # attach to ghead if head isn't attached to root else attach to root - heads[tokenid] = ghead if head != ghead else tokenid + heads[tokenid] = new_head + return new_head def _find_new_head(token, headlabel): diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 3957e4d77..60d000c44 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -93,8 +93,8 @@ def test_parser_pseudoprojectivity(en_vocab): assert nonproj.is_decorated("X") is False nonproj._lift(0, tree) assert tree == [2, 2, 2] - assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7 - assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10 + assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree) == 7 + assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree2) == 10 # fmt: off proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels) assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]