From 1b41f868d2c08010240912d0f01495c758534cdd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Feb 2016 10:06:13 +0100 Subject: [PATCH] * Check for errors in parser, and parallelise the left-over batch --- spacy/syntax/parser.pxd | 3 +-- spacy/syntax/parser.pyx | 31 +++++++++++++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 1b1495406..77ea376a1 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -12,9 +12,8 @@ from ._state cimport StateC cdef class ParserModel(AveragedPerceptron): cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil - cdef class Parser: cdef readonly ParserModel model cdef readonly TransitionSystem moves - cdef void parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil + cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 6cfa57c60..6c77c6c96 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -123,29 +123,39 @@ cdef class Parser: cdef int i cdef int nr_class = self.moves.n_moves cdef int nr_feat = self.model.nr_feat + cdef int status queue = [] for doc in stream: doc_ptr[len(queue)] = doc.c lengths[len(queue)] = doc.length queue.append(doc) if len(queue) == batch_size: - for i in cython.parallel.prange(batch_size, nogil=True, - num_threads=n_threads): - self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class) + with nogil: + for i in cython.parallel.prange(batch_size, num_threads=n_threads): + status = self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class) + if status != 0: + with gil: + sent_str = queue[i].text + raise ValueError("Error parsing doc: %s" % sent_str) PyErr_CheckSignals() for doc in queue: doc.is_parsed = True yield doc queue = [] batch_size = len(queue) - for i in range(batch_size): - self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class) - for doc in queue: - doc.is_parsed = True - yield doc + with nogil: + for i in cython.parallel.prange(batch_size, num_threads=n_threads): + status = self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class) + if status != 0: + with gil: + sent_str = queue[i].text + raise ValueError("Error parsing doc: %s" % sent_str) + for doc in queue: + doc.is_parsed = True + yield doc PyErr_CheckSignals() - cdef void parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil: + cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil: cdef ExampleC eg eg.nr_feat = nr_feat eg.nr_atom = CONTEXT_SIZE @@ -168,7 +178,7 @@ cdef class Parser: if not eg.is_valid[guess]: with gil: move_name = self.moves.move_name(action.move, action.label) - raise ValueError("Illegal action: %s" % move_name) + return 1 action.do(state, action.label) memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class) for i in range(eg.nr_class): @@ -181,6 +191,7 @@ cdef class Parser: free(eg.atoms) free(eg.scores) free(eg.is_valid) + return 0 def train(self, Doc tokens, GoldParse gold): self.moves.preprocess_gold(gold)