mirror of https://github.com/explosion/spaCy.git
* Moving back to lexeme structs
This commit is contained in:
parent
b488224c09
commit
e567713429
|
@ -105,7 +105,6 @@ cdef class Language:
|
|||
for i, substring in enumerate(substrings):
|
||||
lexemes.append(self.lexicon.lookup(substring))
|
||||
self.cache[string] = lexemes
|
||||
|
||||
cdef Lexeme lexeme
|
||||
for lexeme in lexemes:
|
||||
tokens.append(lexeme)
|
||||
|
@ -178,9 +177,11 @@ cdef class Lexicon:
|
|||
Returns:
|
||||
lexeme (Lexeme): A reference to a lexical type.
|
||||
"""
|
||||
cdef Lexeme lexeme
|
||||
assert len(string) != 0
|
||||
if string in self._dict:
|
||||
return self._dict[string]
|
||||
lexeme = self._dict[string]
|
||||
return lexeme
|
||||
|
||||
cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
|
||||
self._flag_features)
|
||||
|
|
|
@ -1,20 +1,11 @@
|
|||
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
|
||||
|
||||
from spacy.lexeme cimport LexemeC
|
||||
|
||||
DEF MAX_FLAG = 64
|
||||
|
||||
|
||||
cdef class Lexeme:
|
||||
# NB: the readonly keyword refers to _Python_ access. The attributes are
|
||||
# writeable from Cython.
|
||||
cpdef readonly size_t length
|
||||
cpdef readonly double prob
|
||||
cpdef readonly size_t cluster
|
||||
|
||||
cpdef readonly unicode string
|
||||
cpdef readonly list views
|
||||
|
||||
cdef readonly flag_t flags
|
||||
cdef LexemeC* _c
|
||||
|
||||
cpdef bint check_flag(self, size_t flag_id) except *
|
||||
cpdef unicode string_view(self, size_t view_id)
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
# cython: profile=True
|
||||
# cython: embedsignature=True
|
||||
|
||||
|
||||
from libc.stdlib cimport calloc, free, realloc
|
||||
|
||||
from spacy.lexeme cimport lexeme_free, lexeme_init
|
||||
from spacy.lexeme cimport lexeme_check_flag, lexeme_string_view
|
||||
|
||||
|
||||
cdef class Lexeme:
|
||||
"""A lexical type --- a word, punctuation symbol, whitespace sequence, etc
|
||||
keyed by a case-sensitive unicode string. All tokens with the same string,
|
||||
|
@ -48,23 +51,34 @@ cdef class Lexeme:
|
|||
"""
|
||||
def __cinit__(self, unicode string, double prob, int cluster, dict case_stats,
|
||||
dict tag_stats, list string_features, list flag_features):
|
||||
self.prob = prob
|
||||
self.cluster = cluster
|
||||
self.length = len(string)
|
||||
self.string = string
|
||||
|
||||
self.views = []
|
||||
views = []
|
||||
cdef unicode view
|
||||
for string_feature in string_features:
|
||||
view = string_feature(string, prob, cluster, case_stats, tag_stats)
|
||||
self.views.append(view)
|
||||
views.append(view)
|
||||
|
||||
flags = set()
|
||||
for i, flag_feature in enumerate(flag_features):
|
||||
if flag_feature(string, prob, case_stats, tag_stats):
|
||||
self.flags |= (1 << i)
|
||||
if (1 << i):
|
||||
flags.add(i)
|
||||
self._c = lexeme_init(string, prob, cluster, views, flags)
|
||||
|
||||
def __dealloc__(self):
|
||||
pass
|
||||
lexeme_free(self._c)
|
||||
|
||||
# Getter-only property: returns the lexeme's text as a unicode object,
# obtained by UTF-8-decoding the string field read from self._c.
property string:
|
||||
def __get__(self):
|
||||
# Bind the struct's string field to a bytes-typed local first.
cdef bytes utf8_string = self._c.string
|
||||
# Decode those bytes as UTF-8; this is the value handed back to the caller.
cdef unicode string = utf8_string.decode('utf8')
|
||||
return string
|
||||
|
||||
# Getter-only property: forwards the prob field read from self._c.
property prob:
|
||||
def __get__(self): return self._c.prob
|
||||
# Getter-only property: forwards the cluster field read from self._c.
property cluster:
|
||||
def __get__(self): return self._c.cluster
|
||||
# Getter-only property: forwards the length field read from self._c.
property length:
|
||||
def __get__(self): return self._c.length
|
||||
|
||||
cpdef bint check_flag(self, size_t flag_id) except *:
|
||||
"""Lexemes may store language-specific boolean features in a bit-field,
|
||||
|
@ -80,7 +94,7 @@ cdef class Lexeme:
|
|||
>>> lexeme.check_flag(EN.OFT_UPPER)
|
||||
True
|
||||
"""
|
||||
return self.flags & (1 << flag_id)
|
||||
return lexeme_check_flag(self._c, flag_id)
|
||||
|
||||
cpdef unicode string_view(self, size_t view_id):
|
||||
"""Lexemes may store language-specific string-view features, obtained
|
||||
|
@ -100,4 +114,4 @@ cdef class Lexeme:
|
|||
>>> lexeme.string_view(EN.NON_SPARSE)
|
||||
u'Xxxx'
|
||||
"""
|
||||
return self.views[view_id]
|
||||
return lexeme_string_view(self._c, view_id)
|
||||
|
|
Loading…
Reference in New Issue