From 4efb3919940da57965152a538a58cfc05217d513 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 May 2017 18:45:18 +0200 Subject: [PATCH] Fix serializer --- spacy/tokens/doc.pyx | 51 ++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 6f397f963..515228d8d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,4 +1,6 @@ # coding: utf8 +# cython: infer_types=True +# cython: bounds_check=False from __future__ import unicode_literals cimport cython @@ -565,7 +567,7 @@ cdef class Doc: for i in range(self.length): self.c[i] = parsed[i] - def from_array(self, attrs, array): + def from_array(self, attrs, int[:, :] array): """ Write to a `Doc` object, from an `(M, N)` array of attributes. """ @@ -573,34 +575,23 @@ cdef class Doc: cdef attr_id_t attr_id cdef TokenC* tokens = self.c cdef int length = len(array) - cdef attr_t[:] values + # Get set up for fast loading + cdef Pool mem = Pool() + cdef int n_attrs = len(attrs) + attr_ids = mem.alloc(n_attrs, sizeof(attr_id_t)) + for i, attr_id in enumerate(attrs): + attr_ids[i] = attr_id + # Now load the data + for i in range(self.length): + token = &self.c[i] + for j in range(n_attrs): + Token.set_struct_attr(token, attr_ids[j], array[i, j]) + # Auxiliary loading logic for col, attr_id in enumerate(attrs): - values = array[:, col] - if attr_id == HEAD: + if attr_id == TAG: for i in range(length): - tokens[i].head = values[i] - if values[i] >= 1: - tokens[i + values[i]].l_kids += 1 - elif values[i] < 0: - tokens[i + values[i]].r_kids += 1 - elif attr_id == TAG: - for i in range(length): - if values[i] != 0: - self.vocab.morphology.assign_tag(&tokens[i], values[i]) - elif attr_id == POS: - for i in range(length): - tokens[i].pos = values[i] - elif attr_id == DEP: - for i in range(length): - tokens[i].dep = values[i] - elif attr_id == ENT_IOB: - for i in range(length): - tokens[i].ent_iob = values[i] - elif attr_id == ENT_TYPE: - for i in range(length): - tokens[i].ent_type = values[i] - else: - raise ValueError("Unknown attribute ID: %d" % attr_id) + if array[i, col] != 0: + self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) set_children_from_heads(self.c, self.length) self.is_parsed = bool(HEAD in attrs or DEP in attrs) self.is_tagged = bool(TAG in attrs or POS in attrs) @@ -645,9 +636,9 @@ cdef class Doc: self.push_back(lex, has_space) start = end + has_space - self.from_array(attrs[:, 2:], - [TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE]) - + self.from_array([TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE], + attrs[:, 2:]) + return self def merge(self, int start_idx, int end_idx, *args, **attributes): """