diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index 21cc835cd..38c0b1ae4 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -252,7 +252,7 @@ cdef class EnPosTagger: scores = self.model.score(context) tokens.data[i].tag = arg_max(scores, self.model.n_classes) self.set_morph(i, tokens.data) - tokens.pos_scheme = self.tag_map + tokens._tag_strings = self.tag_names def train(self, Tokens tokens, object golds): cdef int i diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 18558fa40..c1c449b50 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + from ._state cimport State from ._state cimport has_head, get_idx, get_s0, get_n0 from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep @@ -106,12 +108,14 @@ cdef class TransitionSystem: self.label_ids = {'ROOT': 0} cdef int label_id for label_str in left_labels: + label_str = unicode(label_str) label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) moves[i].move = LEFT moves[i].label = label_id moves[i].clas = i i += 1 for label_str in right_labels: + label_str = unicode(label_str) label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) moves[i].move = RIGHT moves[i].label = label_id diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 90ad9bf61..d015cd067 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -79,6 +79,10 @@ cdef class GreedyParser: scores = self.model.score(context) guess = self.moves.best_valid(scores, state) self.moves.transition(state, &guess) + # Messily tell Tokens object the string names of the dependency labels + tokens._dep_strings = [None] * len(self.moves.label_ids) + for label, id_ in self.moves.label_ids.items(): + tokens._dep_strings[id_] = label return 0 def train_sent(self, Tokens tokens, list gold_heads, list gold_labels): diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 873819706..4a2b8ab13 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -1,6 +1,7 @@ from libc.stdint cimport uint32_t from numpy cimport ndarray +cimport numpy from cymem.cymem cimport Pool from thinc.typedefs cimport atom_t @@ -29,11 +30,13 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: cdef class Tokens: cdef Pool mem cdef Vocab vocab - cdef list tag_names - cdef dict pos_scheme - + cdef TokenC* data + + cdef unicode _string + cdef list _tag_strings + cdef list _dep_strings cdef int length cdef int max_length diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index b434ee854..849485077 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,5 +1,4 @@ # cython: embedsignature=True -from cython.view cimport array as cvarray from preshed.maps cimport PreshMap from preshed.counter cimport PreshCounter @@ -9,6 +8,7 @@ from .typedefs cimport attr_id_t, attr_t from .typedefs cimport LEMMA from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from .typedefs cimport POS, LEMMA +from .typedefs import UNIV_TAG_NAMES from unidecode import unidecode @@ -84,6 +84,8 @@ cdef class Tokens: self.data = data_start + PADDING self.max_length = size self.length = 0 + self._tag_strings = [] # These will be set by the POS tagger and parser + self._dep_strings = [] # The strings are arbitrary and model-specific. def sentences(self): cdef int i @@ -148,7 +150,7 @@ cdef class Tokens: return idx + t.lex.length @cython.boundscheck(False) - cpdef long[:,:] to_array(self, object attr_ids): + cpdef long[:,:] to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy ndarray of shape N*M, where N is the length of the sentence. @@ -162,8 +164,11 @@ cdef class Tokens: """ cdef int i, j cdef attr_id_t feature - cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)), - itemsize=sizeof(long), format="l") + cdef numpy.ndarray[long, ndim=2] output + # Make an array from the attributes --- otherwise our inner loop is Python + # dict iteration. + cdef numpy.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids) + output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int) for i in range(self.length): for j, feature in enumerate(attr_ids): output[i, j] = get_token_attr(&self.data[i], feature) @@ -232,6 +237,7 @@ cdef class Token: self.sentiment = t.lex.sentiment self.flags = t.lex.flags self.lemma = t.lemma + self.pos = t.pos self.tag = t.tag self.dep = t.dep self.repvec = numpy.asarray( t.lex.repvec) @@ -248,6 +254,24 @@ cdef class Token: """ return self._seq.data[self.i].lex.length + def nbor(self, int i=1): + return Token(self._seq, self.i + i) + + def child(self, int i=1): + cdef const TokenC* t = &self._seq.data[self.i] + if i == 0: + return self + elif i >= 1: + if t.r_kids == 0: + return None + else: + return Token(self._seq, _nth_significant_bit(t.r_kids, i)) + else: + if t.l_kids == 0: + return None + else: + return Token(self._seq, _nth_significant_bit(t.l_kids, i)) + property head: """The token predicted by the parser to be the head of the current token.""" def __get__(self): @@ -290,10 +314,26 @@ cdef class Token: cdef unicode py_ustr = self._seq.vocab.strings[t.lemma] return py_ustr + property pos_: + def __get__(self): + id_to_string = {id_: string for string, id_ in UNIV_TAG_NAMES.items()} + return id_to_string[self.pos] + property tag_: def __get__(self): - return self._seq.tag_names[self.tag] + return self._seq._tag_strings[self.tag] property dep_: def __get__(self): - return self._seq.dep_names[self.dep] + return self._seq._dep_strings[self.dep] + + + +cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil: + cdef int i + for i in range(32): + if bits & (1 << i): + n -= 1 + if n < 1: + return i + return 0 diff --git a/spacy/typedefs.pyx b/spacy/typedefs.pyx index 00a8b8eea..020660f0c 100644 --- a/spacy/typedefs.pyx +++ b/spacy/typedefs.pyx @@ -1,3 +1,6 @@ +from __future__ import unicode_literals + + UNIV_TAG_NAMES = { "NO_TAG": NO_TAG, "ADJ": ADJ,