From bcf8f7ba402552962a2bb948aa0a0df4be3dc5a4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 1 Feb 2016 08:34:55 +0100
Subject: [PATCH] * Add a parse_batch method to Parser, that releases the GIL around a batch of documents.

---
Review notes (free-form area: not part of the commit message, ignored by `git am`):

 * Summary: Parser.parseC now works on raw C data (TokenC*, StateC*, a
   stack-allocated ExampleC) instead of Python objects, so it can run without
   the GIL; Parser.parse_batch calls it for every Doc of a batch inside one
   `with nogil` section. fill_context, set_featuresC, initialize_state and
   finalize_state take `StateC*` instead of `StateClass` accordingly.
 * This copy was rebuilt from a whitespace-collapsed text dump. Line breaks
   and indentation are restored; blank context lines were placed so that each
   hunk matches its `@@` line counts, but their exact positions are inferred
   -- check against the repository before applying. The pointer casts on the
   calloc() results had been stripped as if they were markup and are restored.
 * parseC: `memset(eg.costs, ...)` zeroes a buffer this function never
   allocates (only features, atoms, scores and is_valid are calloc'ed and
   `eg` is an uninitialised stack struct). Drop the memset or allocate/free
   eg.costs in a follow-up.
 * parseC: the "Illegal action" ValueError is raised inside a
   `cdef void ... nogil` function with no `except` clause; presumably it
   cannot reach the caller, and that path skips `del state` and the free()
   calls -- TODO confirm and return an error code instead.
 * parse_batch: doc_ptr and lengths are freed only on the success path; an
   exception while filling them (e.g. a non-Doc item in the batch) leaks both.

 spacy/syntax/_parse_features.pxd   |  3 +-
 spacy/syntax/_parse_features.pyx   |  3 +-
 spacy/syntax/arc_eager.pyx         | 22 +++----
 spacy/syntax/parser.pxd            |  5 +-
 spacy/syntax/parser.pyx            | 96 +++++++++++++++++++++---------
 spacy/syntax/transition_system.pxd |  4 +-
 spacy/syntax/transition_system.pyx |  4 +-
 7 files changed, 89 insertions(+), 48 deletions(-)

diff --git a/spacy/syntax/_parse_features.pxd b/spacy/syntax/_parse_features.pxd
index 191d41ef5..0842e3504 100644
--- a/spacy/syntax/_parse_features.pxd
+++ b/spacy/syntax/_parse_features.pxd
@@ -1,9 +1,10 @@
 from thinc.typedefs cimport atom_t
 
 from .stateclass cimport StateClass
+from ._state cimport StateC
 
 
-cdef int fill_context(atom_t* context, StateClass state) nogil
+cdef int fill_context(atom_t* context, const StateC* state) nogil
 # Context elements
 
 # Ensure each token's attributes are listed: w, p, c, c6, c4. The order
diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx
index 01aa31ba0..bc54e0c9d 100644
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@@ -14,6 +14,7 @@ from itertools import combinations
 
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
+from ._state cimport StateC
 
 from cymem.cymem cimport Pool
 
@@ -59,7 +60,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
     context[10] = token.ent_iob
     context[11] = token.ent_type
 
-cdef int fill_context(atom_t* ctxt, StateClass st) nogil:
+cdef int fill_context(atom_t* ctxt, const StateC* st) nogil:
     # Take care to fill every element of context!
     # We could memset, but this makes it very easy to have broken features that
     # make almost no impact on accuracy. If instead they're unset, the impact
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index b6a94e5d4..b92b66230 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -377,23 +377,23 @@ cdef class ArcEager(TransitionSystem):
             raise Exception(move)
         return t
 
-    cdef int initialize_state(self, StateClass st) except -1:
+    cdef int initialize_state(self, StateC* st) nogil:
         # Ensure sent_start is set to 0 throughout
-        for i in range(st.c.length):
-            st.c._sent[i].sent_start = False
-            st.c._sent[i].l_edge = i
-            st.c._sent[i].r_edge = i
+        for i in range(st.length):
+            st._sent[i].sent_start = False
+            st._sent[i].l_edge = i
+            st._sent[i].r_edge = i
         st.fast_forward()
 
-    cdef int finalize_state(self, StateClass st) nogil:
+    cdef int finalize_state(self, StateC* st) nogil:
         cdef int i
-        for i in range(st.c.length):
-            if st.c._sent[i].head == 0 and st.c._sent[i].dep == 0:
-                st.c._sent[i].dep = self.root_label
+        for i in range(st.length):
+            if st._sent[i].head == 0 and st._sent[i].dep == 0:
+                st._sent[i].dep = self.root_label
             # If we're not using the Break transition, we segment via root-labelled
             # arcs between the root words.
-            elif USE_ROOT_ARC_SEGMENT and st.c._sent[i].dep == self.root_label:
-                st.c._sent[i].head = 0
+            elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == self.root_label:
+                st._sent[i].head = 0
 
     cdef int set_valid(self, int* output, const StateC* st) nogil:
         cdef bint[N_MOVES] is_valid
diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd
index 33a7ffd8c..1b1495406 100644
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@@ -6,14 +6,15 @@ from .stateclass cimport StateClass
 from .arc_eager cimport TransitionSystem
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
+from ._state cimport StateC
 
 
 cdef class ParserModel(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, StateClass stcls) nogil
+    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
 
 
 cdef class Parser:
     cdef readonly ParserModel model
     cdef readonly TransitionSystem moves
 
-    cdef void parseC(self, Doc tokens, StateClass stcls, Example eg) nogil
+    cdef void parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 55d5dcc3f..f44b47d0e 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -1,3 +1,4 @@
+# cython: infer_types=True
 """
 MALT-style dependency parser
 """
@@ -9,6 +10,7 @@ from cpython.exc cimport PyErr_CheckSignals
 
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
+from libc.stdlib cimport malloc, calloc, free
 import random
 import os.path
 from os import path
@@ -21,6 +23,7 @@ from murmurhash.mrmr cimport hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
+from thinc.structs cimport FeatureC
 
 from util import Config
 
@@ -38,6 +41,7 @@ from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
+from ._state cimport StateC
 
 
 
@@ -65,8 +69,8 @@ def ParserFactory(transition_system):
 
 
 cdef class ParserModel(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, StateClass stcls) nogil:
-        fill_context(eg.atoms, stcls)
+    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
+        fill_context(eg.atoms, state)
         eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
 
 
@@ -99,43 +103,77 @@ cdef class Parser:
         return (Parser, (self.moves.strings, self.moves, self.model), None, None)
 
     def __call__(self, Doc tokens):
-        cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
-        self.moves.initialize_state(stcls)
-
-        cdef Example eg = Example(
-                nr_class=self.moves.n_moves,
-                nr_atom=CONTEXT_SIZE,
-                nr_feat=self.model.nr_feat)
+        cdef int nr_class = self.moves.n_moves
+        cdef int nr_feat = self.model.nr_feat
         with nogil:
-            self.parseC(tokens, stcls, eg)
+            self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
+        tokens.is_parsed = True
         # Check for KeyboardInterrupt etc. Untested
         PyErr_CheckSignals()
 
-    cdef void parseC(self, Doc tokens, StateClass stcls, Example eg) nogil:
-        while not stcls.is_final():
-            self.model.set_featuresC(&eg.c, stcls)
-            self.moves.set_valid(eg.c.is_valid, stcls.c)
-            self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
+    def parse_batch(self, batch):
+        cdef TokenC** doc_ptr = <TokenC**>calloc(len(batch), sizeof(TokenC*))
+        cdef int* lengths = <int*>calloc(len(batch), sizeof(int))
+        cdef Doc doc
+        cdef int i
+        for i, doc in enumerate(batch):
+            doc_ptr[i] = doc.c
+            lengths[i] = doc.length
+        cdef int nr_class = self.moves.n_moves
+        cdef int nr_feat = self.model.nr_feat
+        cdef int nr_doc = len(batch)
+        with nogil:
+            for i in range(nr_doc):
+                self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class)
+        for doc in batch:
+            doc.is_parsed = True
+        # Check for KeyboardInterrupt etc. Untested
+        PyErr_CheckSignals()
+        free(doc_ptr)
+        free(lengths)
 
-            guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
+    cdef void parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
+        cdef ExampleC eg
+        eg.nr_feat = nr_feat
+        eg.nr_atom = CONTEXT_SIZE
+        eg.nr_class = nr_class
+        eg.features = <FeatureC*>calloc(sizeof(FeatureC), nr_feat)
+        eg.atoms = <atom_t*>calloc(sizeof(atom_t), CONTEXT_SIZE)
+        eg.scores = <weight_t*>calloc(sizeof(weight_t), nr_class)
+        eg.is_valid = <int*>calloc(sizeof(int), nr_class)
+        state = new StateC(tokens, length)
+        self.moves.initialize_state(state)
+        cdef int i
+        while not state.is_final():
+            self.model.set_featuresC(&eg, state)
+            self.moves.set_valid(eg.is_valid, state)
+            self.model.set_scoresC(eg.scores, eg.features, eg.nr_feat)
+
+            guess = VecVec.arg_max_if_true(eg.scores, eg.is_valid, eg.nr_class)
 
             action = self.moves.c[guess]
-            if not eg.c.is_valid[guess]:
+            if not eg.is_valid[guess]:
                 with gil:
                     move_name = self.moves.move_name(action.move, action.label)
                     raise ValueError("Illegal action: %s" % move_name)
-            action.do(stcls.c, action.label)
-            memset(eg.c.scores, 0, sizeof(eg.c.scores[0]) * eg.c.nr_class)
-            memset(eg.c.costs, 0, sizeof(eg.c.costs[0]) * eg.c.nr_class)
-            for i in range(eg.c.nr_class):
-                eg.c.is_valid[i] = 1
-        self.moves.finalize_state(stcls)
-        tokens.set_parse(stcls.c._sent)
+            action.do(state, action.label)
+            memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class)
+            memset(eg.costs, 0, sizeof(eg.costs[0]) * eg.nr_class)
+            for i in range(eg.nr_class):
+                eg.is_valid[i] = 1
+        self.moves.finalize_state(state)
+        for i in range(length):
+            tokens[i] = state._sent[i]
+        del state
+        free(eg.features)
+        free(eg.atoms)
+        free(eg.scores)
+        free(eg.is_valid)
 
     def train(self, Doc tokens, GoldParse gold):
         self.moves.preprocess_gold(gold)
         cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
-        self.moves.initialize_state(stcls)
+        self.moves.initialize_state(stcls.c)
         cdef Pool mem = Pool()
         cdef Example eg = Example(
                 nr_class=self.moves.n_moves,
@@ -144,7 +182,7 @@ cdef class Parser:
         cdef weight_t loss = 0
         cdef Transition action
         while not stcls.is_final():
-            self.model.set_featuresC(&eg.c, stcls)
+            self.model.set_featuresC(&eg.c, stcls.c)
             self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
             self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
             self.model.updateC(&eg.c)
@@ -174,7 +212,7 @@ cdef class StepwiseState:
         self.parser = parser
         self.doc = doc
         self.stcls = StateClass.init(doc.c, doc.length)
-        self.parser.moves.initialize_state(self.stcls)
+        self.parser.moves.initialize_state(self.stcls.c)
         self.eg = Example(
             nr_class=self.parser.moves.n_moves,
             nr_atom=CONTEXT_SIZE,
@@ -209,7 +247,7 @@ cdef class StepwiseState:
 
     def predict(self):
         self.eg.reset()
-        self.parser.model.set_featuresC(&self.eg.c, self.stcls)
+        self.parser.model.set_featuresC(&self.eg.c, self.stcls.c)
         self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c)
         self.parser.model.set_scoresC(self.eg.c.scores,
             self.eg.c.features, self.eg.c.nr_feat)
@@ -234,7 +272,7 @@ cdef class StepwiseState:
 
     def finish(self):
         if self.stcls.is_final():
-            self.parser.moves.finalize_state(self.stcls)
+            self.parser.moves.finalize_state(self.stcls.c)
         self.doc.set_parse(self.stcls.c._sent)
 
 
diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd
index d9dbe454d..b985498dc 100644
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@@ -38,8 +38,8 @@ cdef class TransitionSystem:
     cdef public int root_label
     cdef public freqs
 
-    cdef int initialize_state(self, StateClass state) except -1
-    cdef int finalize_state(self, StateClass state) nogil
+    cdef int initialize_state(self, StateC* state) nogil
+    cdef int finalize_state(self, StateC* state) nogil
 
     cdef int preprocess_gold(self, GoldParse gold) except -1
 
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 4228e8e67..2a7ac9523 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -47,10 +47,10 @@ cdef class TransitionSystem:
                 (self.strings, labels_by_action, self.freqs),
                 None, None)
 
-    cdef int initialize_state(self, StateClass state) except -1:
+    cdef int initialize_state(self, StateC* state) nogil:
         pass
 
-    cdef int finalize_state(self, StateClass state) nogil:
+    cdef int finalize_state(self, StateC* state) nogil:
         pass
 
     cdef int preprocess_gold(self, GoldParse gold) except -1: