# spaCy/spacy/lexeme.pyx

from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64

from libc.string cimport memset

import orth

from .utf8string cimport Utf8Str

# Module-level constant, initialised to 0. It is not referenced by the
# functions below.
# NOTE(review): presumably the flag bits used for out-of-vocabulary words --
# confirm against callers.
OOV_DIST_FLAGS = 0

# Zero sizeof(Lexeme) bytes of EMPTY_LEXEME at import time. EMPTY_LEXEME is
# declared outside this section of the file (presumably as a Lexeme struct in
# the matching .pxd -- confirm).
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))


def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
    """Pack the orthographic boolean tests for ``string`` into one bit field.

    Each ``orth.is_*`` predicate is called on ``string`` and its result is
    shifted left to the bit position named by the matching ``IS_*`` constant;
    the shifted values are OR-ed together and returned.

    ``upper_pc``, ``title_pc`` and ``lower_pc`` are accepted but are not read
    by this function.
    """
    # The predicates are evaluated top to bottom, in the same order as the
    # bit constants are listed.
    cdef flag_t bits = (
        (orth.is_alpha(string) << IS_ALPHA)
        | (orth.is_ascii(string) << IS_ASCII)
        | (orth.is_digit(string) << IS_DIGIT)
        | (orth.is_lower(string) << IS_LOWER)
        | (orth.is_punct(string) << IS_PUNCT)
        | (orth.is_space(string) << IS_SPACE)
        | (orth.is_title(string) << IS_TITLE)
        | (orth.is_upper(string) << IS_UPPER)
    )
    return bits


cpdef Lexeme init(unicode string, hash_t hashed,
                  StringStore store, dict props) except *:
    """Build and return a Lexeme struct describing ``string``.

    Parameters:
        string: The word form. Its length, the string itself, its first
            character and its last three characters are recorded.
        hashed: Accepted for interface compatibility; not read in this body.
        store: String store used to intern every string-valued attribute;
            the struct holds the ids returned by ``get_string_id``.
        props: Optional precomputed properties. The keys read are 'cluster',
            'pos', 'supersense', 'prob' (each defaulting to 0) and
            'upper_pc', 'lower_pc', 'title_pc' (each defaulting to 0.0).

    Returns:
        The populated Lexeme, by value.

    Any exception raised by the ``orth`` helpers, by ``props`` conversion or
    by ``get_string_id`` propagates to the caller (``except *``).
    """
    cdef Lexeme lex
    # A local C struct starts with indeterminate contents. Zero it so that
    # any field not assigned below has a defined value in the returned copy.
    memset(&lex, 0, sizeof(Lexeme))

    lex.length = len(string)
    lex.sic = get_string_id(string, store)

    lex.cluster = props.get('cluster', 0)
    lex.pos = props.get('pos', 0)
    lex.supersense = props.get('supersense', 0)
    lex.prob = props.get('prob', 0)

    cdef float upper_pc = props.get('upper_pc', 0.0)
    cdef float lower_pc = props.get('lower_pc', 0.0)
    cdef float title_pc = props.get('title_pc', 0.0)

    # Slice rather than index: for a non-empty string this is the same first
    # character, but an empty string gives an empty prefix instead of raising
    # IndexError, in line with the suffix slice on the next line.
    lex.prefix = get_string_id(string[:1], store)
    lex.suffix = get_string_id(string[-3:], store)

    canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
    lex.norm = get_string_id(canon_cased, store)
    lex.shape = get_string_id(orth.word_shape(string), store)
    lex.asciied = get_string_id(orth.asciied(string), store)

    non_sparse = orth.non_sparse(string, lex.prob, lex.cluster,
                                 upper_pc, title_pc, lower_pc)
    lex.vocab10k = get_string_id(non_sparse, store)

    lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
    return lex

cdef atom_t get_string_id(unicode string, StringStore store) except 0:
    """Intern the UTF-8 encoding of ``string`` in ``store`` and return its id.

    The id is the ``i`` field of the ``Utf8Str`` that ``store.intern``
    returns. Because of the ``except 0`` clause, a return value of 0 is
    treated by callers as an error signal.
    """
    # Keep the encoded bytes bound to a local so the char* buffer handed to
    # intern() stays valid for the duration of the call.
    cdef bytes utf8 = string.encode('utf8')
    cdef Utf8Str* interned = store.intern(<char*>utf8, len(utf8))
    return interned.i
* Upd Tokens to use vector, with bounds checking. 2014-09-15 01:22:40 +00:00			`from cpython.ref cimport Py_INCREF`
* Switch from own memory class to cymem, in pip 2014-09-17 21:09:24 +00:00			`from cymem.cymem cimport Pool`
* Rewriting Lexeme serialization. 2014-10-29 12:19:38 +00:00			`from murmurhash.mrmr cimport hash64`
* Upd Tokens to use vector, with bounds checking. 2014-09-15 01:22:40 +00:00
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 14:57:59 +00:00			`from libc.string cimport memset`

* Revising data model of lexeme. Compiles. 2014-10-09 08:53:30 +00:00			`import orth`
* Restoring Lexeme-as-struct 2014-09-10 18:41:37 +00:00
* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`from .utf8string cimport Utf8Str`
* Restoring Lexeme-as-struct 2014-09-10 18:41:37 +00:00
* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`OOV_DIST_FLAGS = 0`
* Revising data model of lexeme. Compiles. 2014-10-09 08:53:30 +00:00
* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))`
* Revising data model of lexeme. Compiles. 2014-10-09 08:53:30 +00:00

* Rewriting Lexeme serialization. 2014-10-29 12:19:38 +00:00			`def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):`
* Switch to new data model, tests passing 2014-10-09 21:11:31 +00:00			`cdef flag_t flags = 0`
* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`flags \|= orth.is_alpha(string) << IS_ALPHA`
			`flags \|= orth.is_ascii(string) << IS_ASCII`
			`flags \|= orth.is_digit(string) << IS_DIGIT`
			`flags \|= orth.is_lower(string) << IS_LOWER`
			`flags \|= orth.is_punct(string) << IS_PUNCT`
			`flags \|= orth.is_space(string) << IS_SPACE`
			`flags \|= orth.is_title(string) << IS_TITLE`
			`flags \|= orth.is_upper(string) << IS_UPPER`
* Switch to new data model, tests passing 2014-10-09 21:11:31 +00:00			`return flags`
* Revising data model of lexeme. Compiles. 2014-10-09 08:53:30 +00:00
* Slight cleaning of tokenizer code 2014-10-10 08:17:22 +00:00
* Remove lexemes vector from Lexicon, and the id and hash attributes from Lexeme 2014-10-30 04:21:38 +00:00			`cpdef Lexeme init(unicode string, hash_t hashed,`
* Rewriting Lexeme serialization. 2014-10-29 12:19:38 +00:00			`StringStore store, dict props) except *:`
			`cdef Lexeme lex`
			`lex.length = len(string)`
			`lex.sic = get_string_id(string, store)`

			`lex.cluster = props.get('cluster', 0)`
			`lex.pos = props.get('pos', 0)`
			`lex.supersense = props.get('supersense', 0)`
			`lex.prob = props.get('prob', 0)`

			`cdef float upper_pc = props.get('upper_pc', 0.0)`
			`cdef float lower_pc = props.get('lower_pc', 0.0)`
			`cdef float title_pc = props.get('title_pc', 0.0)`

			`lex.prefix = get_string_id(string[0], store)`
			`lex.suffix = get_string_id(string[-3:], store)`
			`canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)`
			`lex.norm = get_string_id(canon_cased, store)`
			`lex.shape = get_string_id(orth.word_shape(string), store)`
			`lex.asciied = get_string_id(orth.asciied(string), store)`
			`non_sparse = orth.non_sparse(string, lex.prob, lex.cluster, upper_pc, title_pc, lower_pc)`
			`lex.vocab10k = get_string_id(non_sparse, store)`
			`lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)`
			`return lex`

			`cdef atom_t get_string_id(unicode string, StringStore store) except 0:`
* Fiddle with the way strings are interned in lexeme 2014-09-15 04:34:45 +00:00			`cdef bytes byte_string = string.encode('utf8')`
* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))`
* Rewriting Lexeme serialization. 2014-10-29 12:19:38 +00:00			`return orig_str.i`
No results found.