* Fix POS and dependency label tag names. Add parse and string navigation functions.

2015-01-24 17:29:04 +11:00 · 2015-01-24 17:29:04 +11:00 · a97bed9359
parent cb6a526fcd
commit a97bed9359
6 changed files with 64 additions and 10 deletions
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@ -252,7 +252,7 @@ cdef class EnPosTagger:
                scores = self.model.score(context)
                tokens.data[i].tag = arg_max(scores, self.model.n_classes)
                self.set_morph(i, tokens.data)
-        tokens.pos_scheme = self.tag_map
+        tokens._tag_strings = self.tag_names

    def train(self, Tokens tokens, object golds):
        cdef int i
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 from ._state cimport State
 from ._state cimport has_head, get_idx, get_s0, get_n0
 from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
@ -106,12 +108,14 @@ cdef class TransitionSystem:
        self.label_ids = {'ROOT': 0}
        cdef int label_id
        for label_str in left_labels:
+            label_str = unicode(label_str)
            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
            moves[i].move = LEFT
            moves[i].label = label_id
            moves[i].clas = i
            i += 1
        for label_str in right_labels:
+            label_str = unicode(label_str)
            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
            moves[i].move = RIGHT
            moves[i].label = label_id
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -79,6 +79,10 @@ cdef class GreedyParser:
            scores = self.model.score(context)
            guess = self.moves.best_valid(scores, state)
            self.moves.transition(state, &guess)
+        # Messily tell Tokens object the string names of the dependency labels
+        tokens._dep_strings = [None] * len(self.moves.label_ids)
+        for label, id_ in self.moves.label_ids.items():
+            tokens._dep_strings[id_] = label
        return 0

    def train_sent(self, Tokens tokens, list gold_heads, list gold_labels):
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@ -1,6 +1,7 @@
 from libc.stdint cimport uint32_t

 from numpy cimport ndarray
+cimport numpy

 from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
@ -29,11 +30,13 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
 cdef class Tokens:
    cdef Pool mem
    cdef Vocab vocab
-    cdef list tag_names
-    cdef dict pos_scheme
-
+    
    cdef TokenC* data
+    
+
    cdef unicode _string
+    cdef list _tag_strings
+    cdef list _dep_strings

    cdef int length
    cdef int max_length
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -1,5 +1,4 @@
 # cython: embedsignature=True
-from cython.view cimport array as cvarray

 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter
@ -9,6 +8,7 @@ from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
 from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .typedefs cimport POS, LEMMA
+from .typedefs import UNIV_TAG_NAMES

 from unidecode import unidecode

@ -84,6 +84,8 @@ cdef class Tokens:
        self.data = data_start + PADDING
        self.max_length = size
        self.length = 0
+        self._tag_strings = [] # These will be set by the POS tagger and parser
+        self._dep_strings = [] # The strings are arbitrary and model-specific.

    def sentences(self):
        cdef int i
@ -148,7 +150,7 @@ cdef class Tokens:
        return idx + t.lex.length

    @cython.boundscheck(False)
-    cpdef long[:,:] to_array(self, object attr_ids):
+    cpdef long[:,:] to_array(self, object py_attr_ids):
        """Given a list of M attribute IDs, export the tokens to a numpy ndarray
        of shape N*M, where N is the length of the sentence.

@ -162,8 +164,11 @@ cdef class Tokens:
        """
        cdef int i, j
        cdef attr_id_t feature
-        cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
-                                        itemsize=sizeof(long), format="l")
+        cdef numpy.ndarray[long, ndim=2] output
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef numpy.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids)
+        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int)
        for i in range(self.length):
            for j, feature in enumerate(attr_ids):
                output[i, j] = get_token_attr(&self.data[i], feature)
@ -232,6 +237,7 @@ cdef class Token:
        self.sentiment = t.lex.sentiment
        self.flags = t.lex.flags
        self.lemma = t.lemma
+        self.pos = t.pos
        self.tag = t.tag
        self.dep = t.dep
        self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
@ -248,6 +254,24 @@ cdef class Token:
        """
        return self._seq.data[self.i].lex.length

+    def nbor(self, int i=1):
+        """Return a Token for position ``self.i + i`` of the same sequence.
+
+        i (int): Signed offset added to this token's index. Defaults to 1.
+        """
+        target = self.i + i
+        return Token(self._seq, target)
+
+    def child(self, int i=1):
+        """Select a child of this token by signed rank.
+
+        i (int): 0 returns this token itself. A positive value picks the
+            i-th set bit of ``r_kids``; a negative value picks the
+            ``-i``-th set bit of ``l_kids``.
+
+        Returns None when the relevant bit-field is zero, otherwise a Token
+        built from the bit position found by ``_nth_significant_bit``.
+        """
+        cdef const TokenC* t = &self._seq.data[self.i]
+        if i == 0:
+            return self
+        elif i >= 1:
+            if t.r_kids == 0:
+                return None
+            else:
+                return Token(self._seq, _nth_significant_bit(t.r_kids, i))
+        else:
+            if t.l_kids == 0:
+                return None
+            else:
+                # Pass the magnitude of i: a negative count makes the bit
+                # search stop at the first set bit, so child(-2) would
+                # return the same token as child(-1).
+                # NOTE(review): the bit position is used directly as a token
+                # index in both branches; confirm it is not meant to be an
+                # offset relative to self.i.
+                return Token(self._seq, _nth_significant_bit(t.l_kids, -i))
+        
    property head:
        """The token predicted by the parser to be the head of the current token."""
        def __get__(self):
@ -290,10 +314,26 @@ cdef class Token:
            cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
            return py_ustr

+    property pos_:
+        """The string name whose id in UNIV_TAG_NAMES equals this token's pos."""
+        def __get__(self):
+            # UNIV_TAG_NAMES maps name -> id; invert it so we can look up by id.
+            names_by_id = {}
+            for name, tag_id in UNIV_TAG_NAMES.items():
+                names_by_id[tag_id] = name
+            return names_by_id[self.pos]
+
    property tag_:
        def __get__(self):
-            return self._seq.tag_names[self.tag]
+            return self._seq._tag_strings[self.tag]

    property dep_:
        def __get__(self):
-            return self._seq.dep_names[self.dep]
+            return self._seq._dep_strings[self.dep]
+
+
+
+# Return the position (0-31) of the n-th set bit of `bits`, scanning upward
+# from bit 0. Any n <= 1 selects the first set bit.
+# NOTE(review): 0 is also returned when `bits` has fewer than n set bits,
+# which callers cannot tell apart from "bit 0 is the answer".
+cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
+    cdef int i
+    cdef uint32_t mask
+    for i in range(32):
+        # Build the mask as an unsigned value: a plain `1 << 31` shifts into
+        # the sign bit of a signed C int, which is undefined behaviour.
+        mask = (<uint32_t>1) << i
+        if bits & mask:
+            n -= 1
+            if n < 1:
+                return i
+    return 0
--- a/spacy/typedefs.pyx
+++ b/spacy/typedefs.pyx
@ -1,3 +1,6 @@
+from __future__ import unicode_literals
+
+
 UNIV_TAG_NAMES = {
    "NO_TAG": NO_TAG,
    "ADJ": ADJ,