mirror of https://github.com/explosion/spaCy.git
Speed up the StateC::L feature function (#10019)
* Speed up the StateC::L feature function

  This function gets the n-th most-recent left-arc with a particular head. Before this change, StateC::L would construct a vector of all left-arcs with the given head and then pick the n-th most recent from that vector. Since the number of left-arcs strongly correlates with the doc length and the feature is constructed for every transition, this can make transition-parsing quadratic.

  With this change, StateC::L:

  - Searches left-arcs backwards.
  - Stops early when the n-th matching transition is found.
  - Does not construct a vector (reducing memory pressure).

  This change doesn't avoid the linear search when the transition that is queried does not occur in the left-arcs. Regardless, performance is improved quite a bit with very long docs:

  Before:

  N     Time
  400   3.3
  800   5.4
  1600  11.6
  3200  30.7

  After:

  N     Time
  400   3.2
  800   5.0
  1600  9.5
  3200  23.2

  We can probably do better with more tailored data structures, but I first wanted to make a low-impact PR.

  Found while investigating #9858.

* StateC::L: simplify loop
This commit is contained in:
parent
176a90edee
commit
677c1a3507
|
@ -1,3 +1,4 @@
|
||||||
|
from cython.operator cimport dereference as deref, preincrement as incr
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.stdint cimport uint32_t, uint64_t
|
||||||
|
@ -184,16 +185,20 @@ cdef cppclass StateC:
|
||||||
int L(int head, int idx) nogil const:
|
int L(int head, int idx) nogil const:
|
||||||
if idx < 1 or this._left_arcs.size() == 0:
|
if idx < 1 or this._left_arcs.size() == 0:
|
||||||
return -1
|
return -1
|
||||||
cdef vector[int] lefts
|
|
||||||
for i in range(this._left_arcs.size()):
|
# Work backwards through left-arcs to find the arc at the
|
||||||
arc = this._left_arcs.at(i)
|
# requested index more quickly.
|
||||||
|
cdef size_t child_index = 0
|
||||||
|
it = this._left_arcs.const_rbegin()
|
||||||
|
while it != this._left_arcs.rend():
|
||||||
|
arc = deref(it)
|
||||||
if arc.head == head and arc.child != -1 and arc.child < head:
|
if arc.head == head and arc.child != -1 and arc.child < head:
|
||||||
lefts.push_back(arc.child)
|
child_index += 1
|
||||||
idx = (<int>lefts.size()) - idx
|
if child_index == idx:
|
||||||
if idx < 0:
|
return arc.child
|
||||||
return -1
|
incr(it)
|
||||||
else:
|
|
||||||
return lefts.at(idx)
|
return -1
|
||||||
|
|
||||||
int R(int head, int idx) nogil const:
|
int R(int head, int idx) nogil const:
|
||||||
if idx < 1 or this._right_arcs.size() == 0:
|
if idx < 1 or this._right_arcs.size() == 0:
|
||||||
|
|
Loading…
Reference in New Issue