mirror of https://github.com/explosion/spaCy.git
* Fix POS and dependency label tag names. Add parse and string navigation functions.
This commit is contained in:
parent
cb6a526fcd
commit
a97bed9359
|
@ -252,7 +252,7 @@ cdef class EnPosTagger:
|
|||
scores = self.model.score(context)
|
||||
tokens.data[i].tag = arg_max(scores, self.model.n_classes)
|
||||
self.set_morph(i, tokens.data)
|
||||
tokens.pos_scheme = self.tag_map
|
||||
tokens._tag_strings = self.tag_names
|
||||
|
||||
def train(self, Tokens tokens, object golds):
|
||||
cdef int i
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from ._state cimport State
|
||||
from ._state cimport has_head, get_idx, get_s0, get_n0
|
||||
from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
|
||||
|
@ -106,12 +108,14 @@ cdef class TransitionSystem:
|
|||
self.label_ids = {'ROOT': 0}
|
||||
cdef int label_id
|
||||
for label_str in left_labels:
|
||||
label_str = unicode(label_str)
|
||||
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
|
||||
moves[i].move = LEFT
|
||||
moves[i].label = label_id
|
||||
moves[i].clas = i
|
||||
i += 1
|
||||
for label_str in right_labels:
|
||||
label_str = unicode(label_str)
|
||||
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
|
||||
moves[i].move = RIGHT
|
||||
moves[i].label = label_id
|
||||
|
|
|
@ -79,6 +79,10 @@ cdef class GreedyParser:
|
|||
scores = self.model.score(context)
|
||||
guess = self.moves.best_valid(scores, state)
|
||||
self.moves.transition(state, &guess)
|
||||
# Messily tell Tokens object the string names of the dependency labels
|
||||
tokens._dep_strings = [None] * len(self.moves.label_ids)
|
||||
for label, id_ in self.moves.label_ids.items():
|
||||
tokens._dep_strings[id_] = label
|
||||
return 0
|
||||
|
||||
def train_sent(self, Tokens tokens, list gold_heads, list gold_labels):
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from libc.stdint cimport uint32_t
|
||||
|
||||
from numpy cimport ndarray
|
||||
cimport numpy
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport atom_t
|
||||
|
@ -29,11 +30,13 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
|||
cdef class Tokens:
|
||||
cdef Pool mem
|
||||
cdef Vocab vocab
|
||||
cdef list tag_names
|
||||
cdef dict pos_scheme
|
||||
|
||||
|
||||
cdef TokenC* data
|
||||
|
||||
|
||||
cdef unicode _string
|
||||
cdef list _tag_strings
|
||||
cdef list _dep_strings
|
||||
|
||||
cdef int length
|
||||
cdef int max_length
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
# cython: embedsignature=True
|
||||
from cython.view cimport array as cvarray
|
||||
|
||||
from preshed.maps cimport PreshMap
|
||||
from preshed.counter cimport PreshCounter
|
||||
|
@ -9,6 +8,7 @@ from .typedefs cimport attr_id_t, attr_t
|
|||
from .typedefs cimport LEMMA
|
||||
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from .typedefs cimport POS, LEMMA
|
||||
from .typedefs import UNIV_TAG_NAMES
|
||||
|
||||
from unidecode import unidecode
|
||||
|
||||
|
@ -84,6 +84,8 @@ cdef class Tokens:
|
|||
self.data = data_start + PADDING
|
||||
self.max_length = size
|
||||
self.length = 0
|
||||
self._tag_strings = [] # These will be set by the POS tagger and parser
|
||||
self._dep_strings = [] # The strings are arbitrary and model-specific.
|
||||
|
||||
def sentences(self):
|
||||
cdef int i
|
||||
|
@ -148,7 +150,7 @@ cdef class Tokens:
|
|||
return idx + t.lex.length
|
||||
|
||||
@cython.boundscheck(False)
|
||||
cpdef long[:,:] to_array(self, object attr_ids):
|
||||
cpdef long[:,:] to_array(self, object py_attr_ids):
|
||||
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray
|
||||
of shape N*M, where N is the length of the sentence.
|
||||
|
||||
|
@ -162,8 +164,11 @@ cdef class Tokens:
|
|||
"""
|
||||
cdef int i, j
|
||||
cdef attr_id_t feature
|
||||
cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
|
||||
itemsize=sizeof(long), format="l")
|
||||
cdef numpy.ndarray[long, ndim=2] output
|
||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||
# dict iteration.
|
||||
cdef numpy.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids)
|
||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int)
|
||||
for i in range(self.length):
|
||||
for j, feature in enumerate(attr_ids):
|
||||
output[i, j] = get_token_attr(&self.data[i], feature)
|
||||
|
@ -232,6 +237,7 @@ cdef class Token:
|
|||
self.sentiment = t.lex.sentiment
|
||||
self.flags = t.lex.flags
|
||||
self.lemma = t.lemma
|
||||
self.pos = t.pos
|
||||
self.tag = t.tag
|
||||
self.dep = t.dep
|
||||
self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
|
||||
|
@ -248,6 +254,24 @@ cdef class Token:
|
|||
"""
|
||||
return self._seq.data[self.i].lex.length
|
||||
|
||||
def nbor(self, int i=1):
    """Return the token located `i` positions away from this one.

    The new Token is built over the same underlying sequence at index
    ``self.i + i``; a negative `i` therefore addresses an earlier position.
    No range check is done here.
    """
    position = self.i + i
    return Token(self._seq, position)
|
||||
|
||||
def child(self, int i=1):
    """Return a child of this token selected by `i`, or None.

    i == 0 returns this token itself.
    i >= 1 selects from the `r_kids` bit mask, i < 0 from the `l_kids` mask.
    None is returned when the selected mask is empty (equal to 0).

    The value handed to the Token constructor is whatever
    `_nth_significant_bit` returns for the mask and `i`.
    """
    cdef const TokenC* t = &self._seq.data[self.i]
    if i == 0:
        return self
    if i >= 1:
        if t.r_kids == 0:
            return None
        return Token(self._seq, _nth_significant_bit(t.r_kids, i))
    # NOTE(review): `i` is passed through unchanged, i.e. still negative,
    # on this path -- confirm that is what the bit helper expects.
    if t.l_kids == 0:
        return None
    return Token(self._seq, _nth_significant_bit(t.l_kids, i))
|
||||
|
||||
property head:
|
||||
"""The token predicted by the parser to be the head of the current token."""
|
||||
def __get__(self):
|
||||
|
@ -290,10 +314,26 @@ cdef class Token:
|
|||
cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
|
||||
return py_ustr
|
||||
|
||||
property pos_:
    """String name of this token's `pos` value.

    UNIV_TAG_NAMES maps names to ids; the mapping is inverted on each
    access and indexed with `self.pos`. Raises KeyError when no name
    carries that id.
    """
    def __get__(self):
        names_by_id = {}
        for name, tag_id in UNIV_TAG_NAMES.items():
            # Later entries overwrite earlier ones that share an id.
            names_by_id[tag_id] = name
        return names_by_id[self.pos]
|
||||
|
||||
property tag_:
    """String name of this token's `tag` value.

    Looks `self.tag` up in the owning sequence's `_tag_strings` list.
    That list starts out empty and is assigned by the POS tagger, so
    indexing raises IndexError until a tagger has processed the sequence.
    """
    def __get__(self):
        # The former leading `return self._seq.tag_names[...]` made this
        # lookup unreachable; `_tag_strings` is the list the tagger fills.
        return self._seq._tag_strings[self.tag]
|
||||
|
||||
property dep_:
    """String name of this token's `dep` (dependency label) value.

    Looks `self.dep` up in the owning sequence's `_dep_strings` list.
    That list starts out empty and is filled by the parser, indexed by
    label id, so indexing raises IndexError until a parser has processed
    the sequence.
    """
    def __get__(self):
        # The former leading `return self._seq.dep_names[...]` made this
        # lookup unreachable; `_dep_strings` is the list the parser fills.
        return self._seq._dep_strings[self.dep]
|
||||
|
||||
|
||||
|
||||
cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
    # Return the zero-based position of the n-th set bit in `bits`, counting
    # upwards from the least-significant bit (n == 1 selects the lowest set
    # bit).
    #
    # For n < 1 the countdown is already below 1 at the first set bit, so the
    # position of the lowest set bit is returned.
    #
    # Returns 0 when `bits` has fewer than n set bits. That is the same value
    # as "bit 0 is the answer", so the two cases cannot be told apart from the
    # return value alone.
    cdef int i
    for i in range(32):
        # Shift the unsigned mask down instead of shifting a signed literal 1
        # up: `1 << 31` does not fit in a 32-bit signed int.
        if (bits >> i) & 1:
            n -= 1
            if n < 1:
                return i
    return 0
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
UNIV_TAG_NAMES = {
|
||||
"NO_TAG": NO_TAG,
|
||||
"ADJ": ADJ,
|
||||
|
|
Loading…
Reference in New Issue