From a97bed93591458cc28547bad27d0ed85edd12aa1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 24 Jan 2015 17:29:04 +1100
Subject: [PATCH] * Fix POS and dependency label tag names.  Add parse and
 string navigation functions.

---
 spacy/en/pos.pyx           |  2 +-
 spacy/syntax/arc_eager.pyx |  4 +++
 spacy/syntax/parser.pyx    |  4 +++
 spacy/tokens.pxd           |  9 ++++---
 spacy/tokens.pyx           | 52 +++++++++++++++++++++++++++++++++-----
 spacy/typedefs.pyx         |  3 +++
 6 files changed, 64 insertions(+), 10 deletions(-)

diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx
index 21cc835cd..38c0b1ae4 100644
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -252,7 +252,7 @@ cdef class EnPosTagger:
                 scores = self.model.score(context)
                 tokens.data[i].tag = arg_max(scores, self.model.n_classes)
                 self.set_morph(i, tokens.data)
-        tokens.pos_scheme = self.tag_map
+        tokens._tag_strings = self.tag_names
 
     def train(self, Tokens tokens, object golds):
         cdef int i
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 18558fa40..c1c449b50 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 from ._state cimport State
 from ._state cimport has_head, get_idx, get_s0, get_n0
 from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
@@ -106,12 +108,14 @@ cdef class TransitionSystem:
         self.label_ids = {'ROOT': 0}
         cdef int label_id
         for label_str in left_labels:
+            label_str = unicode(label_str)
             label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
             moves[i].move = LEFT
             moves[i].label = label_id
             moves[i].clas = i
             i += 1
         for label_str in right_labels:
+            label_str = unicode(label_str)
             label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
             moves[i].move = RIGHT
             moves[i].label = label_id
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 90ad9bf61..d015cd067 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -79,6 +79,10 @@ cdef class GreedyParser:
             scores = self.model.score(context)
             guess = self.moves.best_valid(scores, state)
             self.moves.transition(state, &guess)
+        # Messily tell Tokens object the string names of the dependency labels
+        tokens._dep_strings = [None] * len(self.moves.label_ids)
+        for label, id_ in self.moves.label_ids.items():
+            tokens._dep_strings[id_] = label
         return 0
 
     def train_sent(self, Tokens tokens, list gold_heads, list gold_labels):
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 873819706..4a2b8ab13 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -1,6 +1,7 @@
 from libc.stdint cimport uint32_t
 
 from numpy cimport ndarray
+cimport numpy
 
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
@@ -29,11 +30,13 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
 cdef class Tokens:
     cdef Pool mem
     cdef Vocab vocab
-    cdef list tag_names
-    cdef dict pos_scheme
-
+    
     cdef TokenC* data
+    
+
     cdef unicode _string
+    cdef list _tag_strings
+    cdef list _dep_strings
 
     cdef int length
     cdef int max_length
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index b434ee854..849485077 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -1,5 +1,4 @@
 # cython: embedsignature=True
-from cython.view cimport array as cvarray
 
 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter
@@ -9,6 +8,7 @@ from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
 from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .typedefs cimport POS, LEMMA
+from .typedefs import UNIV_TAG_NAMES
 
 from unidecode import unidecode
 
@@ -84,6 +84,8 @@ cdef class Tokens:
         self.data = data_start + PADDING
         self.max_length = size
         self.length = 0
+        self._tag_strings = [] # These will be set by the POS tagger and parser
+        self._dep_strings = [] # The strings are arbitrary and model-specific.
 
     def sentences(self):
         cdef int i
@@ -148,7 +150,7 @@ cdef class Tokens:
         return idx + t.lex.length
 
     @cython.boundscheck(False)
-    cpdef long[:,:] to_array(self, object attr_ids):
+    cpdef long[:,:] to_array(self, object py_attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy ndarray
         of shape N*M, where N is the length of the sentence.
 
@@ -162,8 +164,11 @@ cdef class Tokens:
         """
         cdef int i, j
         cdef attr_id_t feature
-        cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
-                                        itemsize=sizeof(long), format="l")
+        cdef numpy.ndarray[long, ndim=2] output
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef numpy.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids)
+        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int)
         for i in range(self.length):
             for j, feature in enumerate(attr_ids):
                 output[i, j] = get_token_attr(&self.data[i], feature)
@@ -232,6 +237,7 @@ cdef class Token:
         self.sentiment = t.lex.sentiment
         self.flags = t.lex.flags
         self.lemma = t.lemma
+        self.pos = t.pos
         self.tag = t.tag
         self.dep = t.dep
         self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
@@ -248,6 +254,24 @@ cdef class Token:
         """
         return self._seq.data[self.i].lex.length
 
+    def nbor(self, int i=1):  # Token at offset i from this one in the same sequence (default: next)
+        return Token(self._seq, self.i + i)  # self.i + i is not range-checked in this method
+
+    def child(self, int i=1):
+        # Return a Token selected from this token's child fields, or None.
+        #   i == 0 -> this token itself
+        #   i >= 1 -> chosen via the r_kids field; i < 0 -> via the l_kids field
+        # None is returned when the consulted field is zero.
+        cdef const TokenC* tok = &self._seq.data[self.i]
+        if i == 0:
+            return self
+        kids = tok.r_kids if i >= 1 else tok.l_kids
+        if kids == 0:
+            return None
+        # NOTE(review): the helper's result is used directly as the index into
+        # self._seq, and a negative i is forwarded unchanged -- confirm both.
+        return Token(self._seq, _nth_significant_bit(kids, i))
+
     property head:
         """The token predicted by the parser to be the head of the current token."""
         def __get__(self):
@@ -290,10 +314,26 @@ cdef class Token:
             cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
             return py_ustr
 
+    property pos_:
+        """The string name for the token's pos ID, found by inverting UNIV_TAG_NAMES."""
+        def __get__(self):
+            return {tag_id: name for name, tag_id in UNIV_TAG_NAMES.items()}[self.pos]
+
     property tag_:
         def __get__(self):
-            return self._seq.tag_names[self.tag]
+            return self._seq._tag_strings[self.tag]  # list starts empty; filled in by the POS tagger
 
     property dep_:
         def __get__(self):
-            return self._seq.dep_names[self.dep]
+            return self._seq._dep_strings[self.dep]  # list starts empty; filled in by the parser
+
+
+# Return the index (0-31) of the n-th set bit of `bits`, counting up from bit 0.
+cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
+    cdef int i
+    for i in range(32):
+        if bits & ((<uint32_t>1) << i):  # unsigned one: a signed `1 << 31` overflows int
+            n -= 1
+            if n < 1:  # for n <= 1 on entry this already holds at the first set bit
+                return i
+    return 0  # fewer than n bits set; callers cannot tell this apart from "bit 0"
diff --git a/spacy/typedefs.pyx b/spacy/typedefs.pyx
index 00a8b8eea..020660f0c 100644
--- a/spacy/typedefs.pyx
+++ b/spacy/typedefs.pyx
@@ -1,3 +1,6 @@
+from __future__ import unicode_literals
+
+
 UNIV_TAG_NAMES = {
     "NO_TAG": NO_TAG,
     "ADJ": ADJ,