From 78a8bec4d0a0e607acd3f9a2c6eaafe54c7ca4ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 21 Feb 2022 15:02:21 +0100
Subject: [PATCH] Make core projectivization functions cdef nogil (#10241)

* Make core projectivization methods cdef nogil

While profiling the parser, I noticed that a relatively large share of
time is spent in projectivization. This change rewrites the functions in
the core loops as cdef nogil for efficiency.

In C++-land, we use vector in place of Python lists and absent heads
are represented as -1 in place of None.

* _heads_to_c: add assertion

Validation should be performed by the caller, but this assertion ensures that
we are not reading/writing out of bounds with incorrect input.
---
 spacy/pipeline/_parser_internals/nonproj.pyx | 83 +++++++++++++++-----
 spacy/tests/parser/test_nonproj.py           |  4 +-
 2 files changed, 66 insertions(+), 21 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx
index 82070cd27..36163fcc3 100644
--- a/spacy/pipeline/_parser_internals/nonproj.pyx
+++ b/spacy/pipeline/_parser_internals/nonproj.pyx
@@ -4,6 +4,10 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration
 scheme.
 """
 from copy import copy
+from libc.limits cimport INT_MAX
+from libc.stdlib cimport abs
+from libcpp cimport bool
+from libcpp.vector cimport vector
 
 from ...tokens.doc cimport Doc, set_children_from_heads
 
@@ -41,13 +45,18 @@ def contains_cycle(heads):
 
 
 def is_nonproj_arc(tokenid, heads):
+    cdef vector[int] c_heads = _heads_to_c(heads)
+    return _is_nonproj_arc(tokenid, c_heads)
+
+
+cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil:
     # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
     # if there is a token k, h < k < d such that h is not
     # an ancestor of k. Same for h -> d, h > d
     head = heads[tokenid]
     if head == tokenid:  # root arcs cannot be non-projective
         return False
-    elif head is None:  # unattached tokens cannot be non-projective
+    elif head < 0:  # unattached tokens cannot be non-projective
         return False
     
     cdef int start, end
@@ -56,19 +65,29 @@ def is_nonproj_arc(tokenid, heads):
     else:
         start, end = (tokenid+1, head)
     for k in range(start, end):
-        for ancestor in ancestors(k, heads):
-            if ancestor is None:  # for unattached tokens/subtrees
-                break
-            elif ancestor == head:  # normal case: k dominated by h
-                break
+        if _has_head_as_ancestor(k, head, heads):
+            continue
         else:  # head not in ancestors: d -> h is non-projective
             return True
     return False
 
 
+cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil:  # walk up from tokenid looking for `head`
+    ancestor = tokenid
+    cnt = 0
+    while cnt < heads.size():  # at most one step per token, so a root self-loop or a cycle cannot spin forever
+        if heads[ancestor] == head or heads[ancestor] < 0:  # found `head`, or hit an unattached (negative) head: also True
+            return True
+        ancestor = heads[ancestor]
+        cnt += 1
+
+    return False
+
+
 def is_nonproj_tree(heads):
+    cdef vector[int] c_heads = _heads_to_c(heads)  # convert once; every per-arc check below reuses this vector
     # a tree is non-projective if at least one arc is non-projective
-    return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
+    return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads)))
 
 
 def decompose(label):
@@ -98,16 +117,31 @@ def projectivize(heads, labels):
     # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
     # which encode a projective and decorated tree.
     proj_heads = copy(heads)
-    smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
-    if smallest_np_arc is None:  # this sentence is already projective
+
+    cdef int new_head
+    cdef vector[int] c_proj_heads = _heads_to_c(proj_heads)
+    cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
+    if smallest_np_arc == -1:  # this sentence is already projective
         return proj_heads, copy(labels)
-    while smallest_np_arc is not None:
-        _lift(smallest_np_arc, proj_heads)
-        smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
+    while smallest_np_arc != -1:
+        new_head = _lift(smallest_np_arc, proj_heads)
+        c_proj_heads[smallest_np_arc] = new_head
+        smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
     deco_labels = _decorate(heads, proj_heads, labels)
     return proj_heads, deco_labels
 
 
+cdef vector[int] _heads_to_c(heads):  # copy Python heads into a C++ vector; None (absent head) becomes -1
+    cdef vector[int] c_heads
+    for head in heads:
+        if head is None:  # identity test; `== None` would go through the element's __eq__
+            c_heads.push_back(-1)
+        else:
+            assert head < len(heads)  # upper bound only: keeps later vector indexing with this head in range
+            c_heads.push_back(head)
+    return c_heads
+
+
 cpdef deprojectivize(Doc doc):
     # Reattach arcs with decorated labels (following HEAD scheme). For each
     # decorated arc X||Y, search top-down, left-to-right, breadth-first until
@@ -137,27 +171,38 @@ def _decorate(heads, proj_heads, labels):
             deco_labels.append(labels[tokenid])
     return deco_labels
 
+def get_smallest_nonproj_arc_slow(heads):  # Python-callable wrapper around the cdef _get_smallest_nonproj_arc
+    cdef vector[int] c_heads = _heads_to_c(heads)  # copies the whole heads list into a vector on every call
+    return _get_smallest_nonproj_arc(c_heads)  # token index of the arc, or -1 when there is none
 
-def _get_smallest_nonproj_arc(heads):
+
+cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil:  # returns a token index, or -1 (no longer None) if none
     # return the smallest non-proj arc or None
     # where size is defined as the distance between dep and head
     # and ties are broken left to right
-    smallest_size = float('inf')
-    smallest_np_arc = None
-    for tokenid, head in enumerate(heads):
+    cdef int smallest_size = INT_MAX  # C int stand-in for the former float('inf')
+    cdef int smallest_np_arc = -1  # -1 is the "no non-projective arc" sentinel
+    cdef int size
+    cdef int tokenid
+    cdef int head
+
+    for tokenid in range(heads.size()):
+        head = heads[tokenid]
         size = abs(tokenid-head)
-        if size < smallest_size and is_nonproj_arc(tokenid, heads):
+        if size < smallest_size and _is_nonproj_arc(tokenid, heads):
             smallest_size = size
             smallest_np_arc = tokenid
     return smallest_np_arc
 
 
-def _lift(tokenid, heads):
+cpdef int _lift(tokenid, heads):  # cpdef keeps it callable from Python (the tests call nonproj._lift)
     # reattaches a word to it's grandfather
     head = heads[tokenid]
     ghead = heads[head]
+    cdef int new_head = ghead if head != ghead else tokenid
     # attach to ghead if head isn't attached to root else attach to root
-    heads[tokenid] = ghead if head != ghead else tokenid
+    heads[tokenid] = new_head
+    return new_head  # returned so projectivize can mirror the change in its C++ copy of the heads
 
 
 def _find_new_head(token, headlabel):
diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py
index 3957e4d77..60d000c44 100644
--- a/spacy/tests/parser/test_nonproj.py
+++ b/spacy/tests/parser/test_nonproj.py
@@ -93,8 +93,8 @@ def test_parser_pseudoprojectivity(en_vocab):
     assert nonproj.is_decorated("X") is False
     nonproj._lift(0, tree)
     assert tree == [2, 2, 2]
-    assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
-    assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10
+    assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree) == 7
+    assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree2) == 10
     # fmt: off
     proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
     assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]