spaCy/spacy/lexeme.pyx

# cython: embedsignature=True
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64

from libc.string cimport memset

from .orth cimport word_shape
from .typedefs cimport attr_t, flags_t
import numpy

from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_OOV


# Zero-fill every byte of the module-level EMPTY_LEXEME struct.  This runs once,
# as a side effect of importing this module.  EMPTY_LEXEME and LexemeC are not
# declared in this file (presumably they come from the matching .pxd -- confirm).
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))


# Populate the C struct pointed to by `lex` from the Python dict `props`.
#
# Arguments:
#   lex          -- destination struct; its fields are overwritten in place.
#   props        -- dict that must contain the keys 'length', 'orth', 'lower',
#                   'norm', 'shape', 'prefix', 'suffix', 'cluster', 'prob',
#                   'sentiment' and 'flags'.
#   string_store -- indexed with the string-valued props; whatever it returns
#                   is stored in the struct (presumably an integer string ID --
#                   confirm against StringStore).
#   empty_vec    -- pointer stored as-is in lex.repvec; nothing is copied.
#
# Returns: there is no explicit return statement, so the normal-path result is
# the Cython default for an int function.  Because of `except -1`, any Python
# exception raised in the body (e.g. KeyError for a missing key) propagates to
# the caller.
#
# NOTE: fields are written one at a time, so if a later lookup raises, the
# fields assigned before it keep their new values.
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
                              const float* empty_vec) except -1:
    # Copied straight from the dict.
    lex.length = props['length']
    # String-valued attributes go through the string store before being stored.
    lex.orth = string_store[props['orth']]
    lex.lower = string_store[props['lower']]
    lex.norm = string_store[props['norm']]
    lex.shape = string_store[props['shape']]
    lex.prefix = string_store[props['prefix']]
    lex.suffix = string_store[props['suffix']]

    # Numeric attributes, copied straight from the dict.
    lex.cluster = props['cluster']
    lex.prob = props['prob']
    lex.sentiment = props['sentiment']

    # Bit field later tested by Lexeme.check_flag.
    lex.flags = props['flags']
    # Point at the caller-supplied vector; the struct does not get its own copy.
    lex.repvec = empty_vec


cdef class Lexeme:
    """An entry in the vocabulary.  A Lexeme has no string context --- it's a
    word-type, as opposed to a word token.  It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
    tag).

    The boolean ``is_*`` and ``like_*`` properties each report whether one bit
    of ``self.flags`` is set; the bit is selected by the constant of the same
    name cimported from ``.attrs``.
    """
    def __cinit__(self, int vec_size):
        # Allocate a float32 vector of length vec_size.  It is zero-filled so
        # that reading repvec before it has been populated yields defined
        # values instead of whatever happened to be in the allocation.
        self.repvec = numpy.zeros(shape=(vec_size,), dtype=numpy.float32)

    property has_repvec:
        # True when l2_norm is non-zero.
        def __get__(self):
            return self.l2_norm != 0

    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
        """Return True if bit number `flag_id` of `self.flags` is set.

        flag_id is used directly as a shift count and is not range-checked.
        """
        cdef flags_t one = 1
        # Compare against zero explicitly.  The masked value has the full width
        # of flags_t, and handing it to the bint return unchanged could drop a
        # high bit if the conversion narrows it, reporting a set flag as False.
        return (self.flags & (one << flag_id)) != 0

    # --- One read-only property per flag constant; each delegates to check_flag.

    property is_oov:
        def __get__(self):
            return self.check_flag(IS_OOV)

    property is_alpha:
        def __get__(self):
            return self.check_flag(IS_ALPHA)

    property is_ascii:
        def __get__(self):
            return self.check_flag(IS_ASCII)

    property is_digit:
        def __get__(self):
            return self.check_flag(IS_DIGIT)

    property is_lower:
        def __get__(self):
            return self.check_flag(IS_LOWER)

    property is_upper:
        def __get__(self):
            return self.check_flag(IS_UPPER)

    property is_title:
        def __get__(self):
            return self.check_flag(IS_TITLE)

    property is_punct:
        def __get__(self):
            return self.check_flag(IS_PUNCT)

    property is_space:
        def __get__(self):
            return self.check_flag(IS_SPACE)

    property is_stop:
        def __get__(self):
            return self.check_flag(IS_STOP)

    property like_url:
        def __get__(self):
            return self.check_flag(LIKE_URL)

    property like_num:
        def __get__(self):
            return self.check_flag(LIKE_NUM)

    property like_email:
        def __get__(self):
            return self.check_flag(LIKE_EMAIL)
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 13:33:16 +00:00			`# cython: embedsignature=True`
* Upd Tokens to use vector, with bounds checking. 2014-09-15 01:22:40 +00:00			`from cpython.ref cimport Py_INCREF`
* Switch from own memory class to cymem, in pip 2014-09-17 21:09:24 +00:00			`from cymem.cymem cimport Pool`
* Rewriting Lexeme serialization. 2014-10-29 12:19:38 +00:00			`from murmurhash.mrmr cimport hash64`
* Upd Tokens to use vector, with bounds checking. 2014-09-15 01:22:40 +00:00
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 14:57:59 +00:00			`from libc.string cimport memset`

* Fix orth import 2015-01-05 07:49:19 +00:00			`from .orth cimport word_shape`
* Add supersense data to Lexeme objects. Add simple has_sense method to check the flag. 2015-07-01 16:50:37 +00:00			`from .typedefs cimport attr_t, flags_t`
* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00			`import numpy`
* Restoring Lexeme-as-struct 2014-09-10 18:41:37 +00:00
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00			`from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE`
			`from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP`
* Add is_oov property, and fix up handling of attributes 2015-07-26 23:50:06 +00:00			`from .attrs cimport IS_OOV`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
* Revising data model of lexeme. Compiles. 2014-10-09 08:53:30 +00:00
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-11 23:26:22 +00:00			`memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))`
* Revising data model of lexeme. Compiles. 2014-10-09 08:53:30 +00:00

* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00			`cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,`
			`const float* empty_vec) except -1:`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`lex.length = props['length']`
* Rename sic to orth 2015-01-22 15:08:25 +00:00			`lex.orth = string_store[props['orth']]`
Remove trailing whitespace 2015-04-19 08:31:31 +00:00			`lex.lower = string_store[props['lower']]`
			`lex.norm = string_store[props['norm']]`
			`lex.shape = string_store[props['shape']]`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`lex.prefix = string_store[props['prefix']]`
			`lex.suffix = string_store[props['suffix']]`
Remove trailing whitespace 2015-04-19 08:31:31 +00:00
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`lex.cluster = props['cluster']`
			`lex.prob = props['prob']`
			`lex.sentiment = props['sentiment']`

			`lex.flags = props['flags']`
* Rename vec to repvec 2015-01-21 15:03:54 +00:00			`lex.repvec = empty_vec`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-11 23:26:22 +00:00

* Tmp. Refactoring, introducing a Lexeme PyObject. 2015-01-12 00:23:44 +00:00			`cdef class Lexeme:`
* Add docstring to Lexeme 2015-01-24 09:48:34 +00:00			`"""An entry in the vocabulary. A Lexeme has no string context --- it's a`
			`word-type, as opposed to a word token. It therefore has no part-of-speech`
			`tag, dependency parse, or lemma (lemmatization depends on the part-of-speech`
			`tag).`
			`"""`
* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00			`def __cinit__(self, int vec_size):`
* Rename sic to orth 2015-01-22 15:08:25 +00:00			`self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)`
* Add a has_repvec property to Lexeme, and a check function to check flags 2015-02-07 13:42:44 +00:00
			`@property`
			`def has_repvec(self):`
			`return self.l2_norm != 0`

* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00			`cpdef bint check_flag(self, attr_id_t flag_id) except -1:`
			`cdef flags_t one = 1`
			`return self.flags & (one << flag_id)`

* Add is_oov property, and fix up handling of attributes 2015-07-26 23:50:06 +00:00			`property is_oov:`
			`def __get__(self): return self.check_flag(IS_OOV)`

* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00			`property is_alpha:`
			`def __get__(self): return self.check_flag(IS_ALPHA)`

			`property is_ascii:`
			`def __get__(self): return self.check_flag(IS_ASCII)`

			`property is_digit:`
			`def __get__(self): return self.check_flag(IS_DIGIT)`

			`property is_lower:`
			`def __get__(self): return self.check_flag(IS_LOWER)`

			`property is_title:`
			`def __get__(self): return self.check_flag(IS_TITLE)`

			`property is_punct:`
			`def __get__(self): return self.check_flag(IS_PUNCT)`

			`property is_space:`
			`def __get__(self): return self.check_flag(IS_SPACE)`

			`property like_url:`
			`def __get__(self): return self.check_flag(LIKE_URL)`

			`property like_num:`
			`def __get__(self): return self.check_flag(LIKE_NUM)`

			`property like_email:`
			`def __get__(self): return self.check_flag(LIKE_EMAIL)`