spaCy/spacy/lexeme.pyx

# cython: embedsignature=True
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64

from libc.string cimport memset

from .orth cimport word_shape
from .typedefs cimport attr_t, flags_t
import numpy

from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_OOV


# Runs once when the module is initialised: overwrite all sizeof(LexemeC)
# bytes of EMPTY_LEXEME with zeros, so every field of that struct starts at 0.
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))


cdef class Lexeme:
    """An entry in the vocabulary.  A Lexeme has no string context --- it's a
    word-type, as opposed to a word token.  It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
    tag).

    The attributes are exposed in three groups:

    * ``lower``, ``norm``, ``shape``, ``prefix``, ``suffix`` read and write the
      integer fields of the underlying ``LexemeC`` struct directly.
    * ``orth_``, ``lower_``, ``norm_``, ``shape_``, ``prefix_``, ``suffix_``
      translate the same fields through ``self.vocab.strings`` on the way in
      and on the way out.  ``orth_`` is read-only.
    * ``is_*`` / ``like_*`` read and write individual flags through
      ``Lexeme.check_flag`` and ``Lexeme.set_flag``.
    """
    def __init__(self, Vocab vocab, int orth):
        """Bind this object to the vocabulary entry identified by `orth`.

        vocab: The vocabulary that owns the entry; it is kept on the instance
            and used for all string look-ups.
        orth: The integer ID of the entry to wrap.
        """
        self.vocab = vocab
        self.orth = orth
        # The struct is obtained from the vocab (passing vocab.mem); the const
        # qualifier is cast away so the property setters below can write to it.
        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
        # Sanity check: the struct handed back must be the one asked for.
        assert self.c.orth == orth

    # --- Integer views: raw fields of the LexemeC struct -------------------

    property lower:
        def __get__(self): return self.c.lower
        def __set__(self, int x): self.c.lower = x

    property norm:
        def __get__(self): return self.c.norm
        def __set__(self, int x): self.c.norm = x

    property shape:
        def __get__(self): return self.c.shape
        def __set__(self, int x): self.c.shape = x

    property prefix:
        def __get__(self): return self.c.prefix
        def __set__(self, int x): self.c.prefix = x

    property suffix:
        def __get__(self): return self.c.suffix
        def __set__(self, int x): self.c.suffix = x

    # --- String views: the same fields, mapped through vocab.strings -------
    # Every getter indexes vocab.strings with the stored integer; every setter
    # indexes it with the given unicode value and stores the result.

    property orth_:
        def __get__(self):
            return self.vocab.strings[self.c.orth]

    property lower_:
        def __get__(self): return self.vocab.strings[self.c.lower]
        def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]

    property norm_:
        # Resolved through vocab.strings like lower_/shape_; returning the raw
        # self.c.norm here would merely duplicate the integer `norm` property.
        def __get__(self): return self.vocab.strings[self.c.norm]
        def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]

    property shape_:
        def __get__(self): return self.vocab.strings[self.c.shape]
        def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]

    property prefix_:
        # Resolved through vocab.strings, mirroring the setter.
        def __get__(self): return self.vocab.strings[self.c.prefix]
        def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]

    property suffix_:
        # Resolved through vocab.strings, mirroring the setter.
        def __get__(self): return self.vocab.strings[self.c.suffix]
        def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]

    # --- Flags ---------------------------------------------------------------
    # Each getter returns Lexeme.check_flag(self.c, FLAG); each setter forwards
    # its argument to Lexeme.set_flag(self.c, FLAG, x).
    # NOTE(review): the setters type the new value as attr_id_t even though it
    # is the flag's value, not an attribute ID; presumably a boolean type was
    # intended -- confirm against the signature of Lexeme.set_flag.

    property is_oov:
        def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x)

    property is_alpha:
        def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x)

    property is_ascii:
        def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x)

    property is_digit:
        def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x)

    property is_lower:
        def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x)

    property is_title:
        def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x)

    property is_punct:
        def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x)

    property is_space:
        def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x)

    property like_url:
        def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)

    property like_num:
        def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)

    property like_email:
        def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 13:33:16 +00:00			`# cython: embedsignature=True`
* Upd Tokens to use vector, with bounds checking. 2014-09-15 01:22:40 +00:00			`from cpython.ref cimport Py_INCREF`
* Switch from own memory class to cymem, in pip 2014-09-17 21:09:24 +00:00			`from cymem.cymem cimport Pool`
* Rewriting Lexeme serialization. 2014-10-29 12:19:38 +00:00			`from murmurhash.mrmr cimport hash64`
* Upd Tokens to use vector, with bounds checking. 2014-09-15 01:22:40 +00:00
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 14:57:59 +00:00			`from libc.string cimport memset`

* Fix orth import 2015-01-05 07:49:19 +00:00			`from .orth cimport word_shape`
* Add supersense data to Lexeme objects. Add simple has_sense method to check the flag. 2015-07-01 16:50:37 +00:00			`from .typedefs cimport attr_t, flags_t`
* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00			`import numpy`
* Restoring Lexeme-as-struct 2014-09-10 18:41:37 +00:00
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00			`from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE`
			`from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP`
* Add is_oov property, and fix up handling of attributes 2015-07-26 23:50:06 +00:00			`from .attrs cimport IS_OOV`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
* Revising data model of lexeme. Compiles. 2014-10-09 08:53:30 +00:00
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-11 23:26:22 +00:00			`memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))`
* Revising data model of lexeme. Compiles. 2014-10-09 08:53:30 +00:00

* Tmp. Refactoring, introducing a Lexeme PyObject. 2015-01-12 00:23:44 +00:00			`cdef class Lexeme:`
* Add docstring to Lexeme 2015-01-24 09:48:34 +00:00			`"""An entry in the vocabulary. A Lexeme has no string context --- it's a`
			`word-type, as opposed to a word token. It therefore has no part-of-speech`
			`tag, dependency parse, or lemma (lemmatization depends on the part-of-speech`
			`tag).`
			`"""`
* Tmp 2015-08-22 20:04:34 +00:00			`def __init__(self, Vocab vocab, int orth):`
			`self.vocab = vocab`
			`self.orth = orth`
* Work on language-independent refactoring 2015-08-23 18:49:18 +00:00			`self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)`
			`assert self.c.orth == orth`
* Tmp 2015-08-22 20:04:34 +00:00
			`property lower:`
			`def __get__(self): return self.c.lower`
			`def __set__(self, int x): self.c.lower = x`

			`property norm:`
			`def __get__(self): return self.c.norm`
			`def __set__(self, int x): self.c.norm = x`

			`property shape:`
			`def __get__(self): return self.c.shape`
			`def __set__(self, int x): self.c.shape = x`

			`property prefix:`
			`def __get__(self): return self.c.prefix`
			`def __set__(self, int x): self.c.prefix = x`

			`property suffix:`
			`def __get__(self): return self.c.suffix`
			`def __set__(self, int x): self.c.suffix = x`

			`property orth_:`
			`def __get__(self):`
			`return self.vocab.strings[self.c.orth]`

			`property lower_:`
			`def __get__(self): return self.vocab.strings[self.c.lower]`
			`def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]`

			`property norm_:`
			`def __get__(self): return self.c.norm`
			`def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]`

			`property shape_:`
			`def __get__(self): return self.vocab.strings[self.c.shape]`
			`def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]`
* Add a has_repvec property to Lexeme, and a check function to check flags 2015-02-07 13:42:44 +00:00
* Tmp 2015-08-22 20:04:34 +00:00			`property prefix_:`
			`def __get__(self): return self.c.prefix`
			`def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]`
* Add a has_repvec property to Lexeme, and a check function to check flags 2015-02-07 13:42:44 +00:00
* Tmp 2015-08-22 20:04:34 +00:00			`property suffix_:`
			`def __get__(self): return self.c.suffix`
			`def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
* Add is_oov property, and fix up handling of attributes 2015-07-26 23:50:06 +00:00			`property is_oov:`
* Tmp 2015-08-22 20:04:34 +00:00			`def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)`
			`def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x)`
* Add is_oov property, and fix up handling of attributes 2015-07-26 23:50:06 +00:00
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00			`property is_alpha:`
* Tmp 2015-08-22 20:04:34 +00:00			`def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)`
			`def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
			`property is_ascii:`
* Tmp 2015-08-22 20:04:34 +00:00			`def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)`
			`def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
			`property is_digit:`
* Tmp 2015-08-22 20:04:34 +00:00			`def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)`
			`def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
			`property is_lower:`
* Tmp 2015-08-22 20:04:34 +00:00			`def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)`
			`def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
			`property is_title:`
* Tmp 2015-08-22 20:04:34 +00:00			`def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)`
			`def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
			`property is_punct:`
* Tmp 2015-08-22 20:04:34 +00:00			`def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)`
			`def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
			`property is_space:`
* Tmp 2015-08-22 20:04:34 +00:00			`def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)`
			`def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
			`property like_url:`
* Tmp 2015-08-22 20:04:34 +00:00			`def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)`
			`def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
			`property like_num:`
* Work on language-independent refactoring 2015-08-23 18:49:18 +00:00			`def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM)`
* Tmp 2015-08-22 20:04:34 +00:00			`def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 14:37:16 +00:00
			`property like_email:`
* Tmp 2015-08-22 20:04:34 +00:00			`def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)`
			`def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)`