spaCy/spacy/lexeme.pyx

64 lines
2.1 KiB
Cython
Raw Normal View History

from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
2014-10-29 12:19:38 +00:00
from murmurhash.mrmr cimport hash64
from libc.string cimport memset
import orth
2014-09-10 18:41:37 +00:00
from .utf8string cimport Utf8Str
2014-09-10 18:41:37 +00:00
# NOTE(review): OOV_DIST_FLAGS is not referenced by any code visible in this
# file; presumably a default flag value for out-of-vocabulary words — confirm
# against its users before changing it.
OOV_DIST_FLAGS = 0
# Zero every byte of EMPTY_LEXEME at import time so all of its fields start
# at 0. EMPTY_LEXEME is declared outside this file (presumably in the
# matching .pxd) and is treated here as having the size of a Lexeme struct.
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
2014-10-29 12:19:38 +00:00
def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
    """Pack the orthographic predicates of `string` into a flag bitfield.

    Each `orth.is_*` predicate result is shifted to the bit position named
    by the matching IS_* constant, and the results are OR-ed together.

    `upper_pc`, `title_pc` and `lower_pc` are accepted for the caller's
    convenience but are not consulted when computing the flags.
    """
    # Predicates are evaluated left to right, in the same order as the
    # individual bits are listed here.
    cdef flag_t bits = (
        (orth.is_alpha(string) << IS_ALPHA)
        | (orth.is_ascii(string) << IS_ASCII)
        | (orth.is_digit(string) << IS_DIGIT)
        | (orth.is_lower(string) << IS_LOWER)
        | (orth.is_punct(string) << IS_PUNCT)
        | (orth.is_space(string) << IS_SPACE)
        | (orth.is_title(string) << IS_TITLE)
        | (orth.is_upper(string) << IS_UPPER)
    )
    return bits
2014-10-10 08:17:22 +00:00
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
                  StringStore store, dict props) except *:
    """Build and return the Lexeme struct describing `string`.

    Parameters:
        i: id stored in the lexeme's `id` field.
        string: the word form; its derived strings are interned in `store`.
        hashed: accepted but not used by this function.
        store: StringStore used to turn every derived string into an id.
        props: optional statistics. The keys read are 'cluster', 'postype',
            'supersense', 'prob', 'upper_pc', 'lower_pc' and 'title_pc';
            each defaults to 0 when absent.

    The order of the `get_string_id` calls is deliberate and must be kept:
    each call interns a string in `store`, so reordering them could change
    the ids that are handed out.

    NOTE(review): `string[0]` presumably fails on an empty string — confirm
    that callers never pass one.
    """
    cdef Lexeme entry
    cdef float upper_pc
    cdef float lower_pc
    cdef float title_pc

    entry.id = i
    entry.length = len(string)
    entry.sic = get_string_id(string, store)

    # Externally supplied attributes.
    entry.cluster = props.get('cluster', 0)
    entry.postype = props.get('postype', 0)
    entry.supersense = props.get('supersense', 0)
    entry.prob = props.get('prob', 0)
    upper_pc = props.get('upper_pc', 0.0)
    lower_pc = props.get('lower_pc', 0.0)
    title_pc = props.get('title_pc', 0.0)

    # Prefix is the first character, suffix is the last three characters.
    entry.prefix = get_string_id(string[0], store)
    entry.suffix = get_string_id(string[-3:], store)

    if not (upper_pc or lower_pc or title_pc):
        # No casing statistics available: the norm is the string itself.
        entry.norm = entry.sic
    else:
        entry.norm = get_string_id(
            orth.canon_case(string, upper_pc, title_pc, lower_pc), store)

    entry.shape = get_string_id(orth.word_shape(string), store)
    entry.asciied = get_string_id(orth.asciied(string), store)
    entry.vocab10k = get_string_id(
        orth.non_sparse(string, entry.prob, entry.cluster,
                        upper_pc, title_pc, lower_pc),
        store)

    entry.flags = get_flags(string, upper_pc, title_pc, lower_pc)
    return entry
cdef id_t get_string_id(unicode string, StringStore store) except 0:
    # Intern `string` in `store` as UTF-8 and return the `i` field of the
    # interned entry. A return value of 0 is declared as the error signal.
    #
    # The encoded bytes are held in a named local so that the char* handed
    # to `store.intern` stays valid for the duration of the call.
    cdef bytes encoded = string.encode('utf8')
    cdef Utf8Str* interned = store.intern(<char*>encoded, len(encoded))
    return interned.i