diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index 633ba48e4..933de124e 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -12,7 +12,10 @@ from .attrs import get_flags
 
 
 def get_lex_props(string):
-    return {'flags': get_flags(string), 'dense': 1}
+    return {'flags': get_flags(string), 'length': len(string),
+            'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
+            'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
+            'sentiment': 0}
 
 
 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
@@ -45,7 +48,7 @@ class English(object):
     """
     def __init__(self, data_dir=LOCAL_DATA_DIR):
         self._data_dir = data_dir
-        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
+        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
                            get_lex_props=get_lex_props)
         tag_names = list(POS_TAGS.keys())
         tag_names.sort()
diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx
index 1ff465442..ab0187ec3 100644
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -283,12 +283,12 @@ cdef class EnPosTagger:
     cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
-        cdef bytes py_string = self.strings[lex.sic]
+        cdef unicode py_string = self.strings[lex.sic]
         if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
         cdef set lemma_strings
         cdef unicode lemma_string
-        lemma_strings = self.lemmatizer(py_string.decode('utf8'), pos)
+        lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i
         return lemma
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index edd871bde..8686f8e6a 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -7,10 +7,8 @@ from .strings cimport StringStore
 
 
 cdef LexemeC EMPTY_LEXEME
 
-cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
-                  dict props) except *
+cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings) except -1
 
-
 cdef class Lexeme:
     cdef const float* vec
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 8eac4b753..dfc82d46e 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -5,27 +5,27 @@ from murmurhash.mrmr cimport hash64
 from libc.string cimport memset
 
 from .orth cimport word_shape
+from .typedefs cimport attr_t
 
 
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 
 
-cdef LexemeC init(id_t i, unicode string, hash_t hashed,
-                  StringStore string_store, dict props) except *:
-    cdef LexemeC lex
-    lex.id = i
-    lex.length = len(string)
-    lex.sic = string_store[string]
-
-    lex.cluster = props.get('cluster', 0)
-    lex.prob = props.get('prob', 0)
+cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store) except -1:
-    lex.prefix = string_store[string[:1]]
-    lex.suffix = string_store[string[-3:]]
-    lex.shape = string_store[word_shape(string)]
-
-    lex.flags = props.get('flags', 0)
-    return lex
+    lex.length = props['length']
+    lex.sic = string_store[props['sic']]
+    lex.norm1 = string_store[props['norm1']]
+    lex.norm2 = string_store[props['norm2']]
+    lex.shape = string_store[props['shape']]
+    lex.prefix = string_store[props['prefix']]
+    lex.suffix = string_store[props['suffix']]
+
+    lex.cluster = props['cluster']
+    lex.prob = props['prob']
+    lex.sentiment = props['sentiment']
+
+    lex.flags = props['flags']
 
 
 cdef class Lexeme:
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 29afde45c..df9b89dc3 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -67,7 +67,7 @@ cdef class StringStore:
             if string_or_id < 1 or string_or_id >= self.size:
                 raise IndexError(string_or_id)
             utf8str = &self.strings[string_or_id]
-            return utf8str.chars[:utf8str.length]
+            return utf8str.chars[:utf8str.length].decode('utf8')
         elif isinstance(string_or_id, bytes):
             utf8str = self.intern(string_or_id, len(string_or_id))
             return utf8str.i
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 35a7c2b63..101bcad63 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -42,32 +42,5 @@ cdef class Tokens:
 
 
 cdef class Token:
-    cdef cvarray vec
-
-    cdef readonly flags_t flags
-
-    cdef readonly attr_t id
-    cdef readonly attr_t sic
-    cdef readonly attr_t dense
-    cdef readonly attr_t shape
-    cdef readonly attr_t prefix
-    cdef readonly attr_t suffix
-
-    cdef readonly attr_t length
-    cdef readonly attr_t cluster
-    cdef readonly attr_t pos_type
-
-    cdef readonly float prob
-    cdef readonly float sentiment
-
-    cdef readonly Morphology morph
-    cdef readonly univ_tag_t pos
-    cdef readonly int fine_pos
-    cdef readonly int idx
-    cdef readonly int lemma
-    cdef readonly int sense
-    cdef readonly int dep_tag
-
-    cdef readonly int head_offset
-    cdef readonly uint32_t l_kids
-    cdef readonly uint32_t r_kids
+    cdef readonly Tokens _seq
+    cdef readonly int i
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 85f2b4f69..61aab89b1 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -85,7 +85,7 @@ cdef class Tokens:
             token (Token):
         """
         bounds_check(i, self.length, PADDING)
-        return cinit_token(&self.data[i])
+        return Token(self, i)
 
     def __iter__(self):
         """Iterate over the tokens.
@@ -174,38 +174,26 @@ cdef class Tokens:
             self.data[i].lex = &EMPTY_LEXEME
 
 
-cdef Token cinit_token(const TokenC* c_tok):
-    cdef Token py_tok = Token.__new__(Token)
-    py_tok.morph = c_tok.morph
-    py_tok.pos = c_tok.pos
-    py_tok.fine_pos = c_tok.fine_pos
-    py_tok.idx = c_tok.idx
-    py_tok.lemma = c_tok.lemma
-    py_tok.sense = c_tok.sense
-    py_tok.dep_tag = c_tok.dep_tag
-    py_tok.head_offset = c_tok.head
-    py_tok.l_kids = c_tok.l_kids
-    py_tok.r_kids = c_tok.r_kids
-    return py_tok
-
-
+@cython.freelist(64)
 cdef class Token:
     """An individual token.
-    """
-    def __init__(self):
-        pass
-        #self._seq = tokens
-        #self.i = i
-    #def __unicode__(self):
-    #    cdef const TokenC* t = &self._seq.data[self.i]
-    #    cdef int end_idx = t.idx + t.lex.length
-    #    if self.i + 1 == self._seq.length:
-    #        return self.string
-    #    if end_idx == t[1].idx:
-    #        return self.string
-    #    else:
-    #        return self.string + ' '
+
+    Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
+    object.
+    """
+    def __init__(self, Tokens tokens, int i):
+        self._seq = tokens
+        self.i = i
+
+    def __unicode__(self):
+        cdef const TokenC* t = &self._seq.data[self.i]
+        cdef int end_idx = t.idx + t.lex.length
+        if self.i + 1 == self._seq.length:
+            return self.string
+        if end_idx == t[1].idx:
+            return self.string
+        else:
+            return self.string + ' '
 
     def __len__(self):
         """The number of unicode code-points in the original string.
@@ -213,87 +201,87 @@ cdef class Token:
         Returns:
             length (int):
         """
-        return self.length
+        return self._seq.data[self.i].lex.length
 
-    #property idx:
-    #    """The index into the original string at which the token starts.
+    property idx:
+        """The index into the original string at which the token starts.
 
-    #    The following is supposed to always be true:
-    #
-    #    >>> original_string[token.idx:token.idx len(token) == token.string
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].idx
+        The following is supposed to always be true:
+
+        >>> original_string[token.idx:token.idx len(token) == token.string
+        """
+        def __get__(self):
+            return self._seq.data[self.i].idx
 
-    #property cluster:
-    #    """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
-    #
-    #    Similar words have better-than-chance likelihood of having similar cluster
-    #    IDs, although the clustering is quite noisy.  Cluster IDs make good features,
-    #    and help to make models slightly more robust to domain variation.
+    property cluster:
+        """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
+
+        Similar words have better-than-chance likelihood of having similar cluster
+        IDs, although the clustering is quite noisy.  Cluster IDs make good features,
+        and help to make models slightly more robust to domain variation.
 
-    #    A common trick is to use only the first N bits of a cluster ID in a feature,
-    #    as the more general part of the hierarchical clustering is often more accurate
-    #    than the lower categories.
+        A common trick is to use only the first N bits of a cluster ID in a feature,
+        as the more general part of the hierarchical clustering is often more accurate
+        than the lower categories.
 
-    #    To assist in this, I encode the cluster IDs little-endian, to allow a simple
-    #    bit-mask:
+        To assist in this, I encode the cluster IDs little-endian, to allow a simple
+        bit-mask:
 
-    #    >>> six_bits = cluster & (2**6 - 1)
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].lex.cluster
+        >>> six_bits = cluster & (2**6 - 1)
+        """
+        def __get__(self):
+            return self._seq.data[self.i].lex.cluster
 
-    #property string:
-    #    """The unicode string of the word, with no whitespace padding."""
-    #    def __get__(self):
-    #        cdef const TokenC* t = &self._seq.data[self.i]
-    #        if t.lex.sic == 0:
-    #            return ''
-    #        cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
-    #        return utf8string.decode('utf8')
+    property string:
+        """The unicode string of the word, with no whitespace padding."""
+        def __get__(self):
+            cdef const TokenC* t = &self._seq.data[self.i]
+            if t.lex.sic == 0:
+                return ''
+            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
+            return py_ustr
 
-    #property lemma:
-    #    """The unicode string of the word's lemma.  If no part-of-speech tag is
-    #    assigned, the most common part-of-speech tag of the word is used.
-    #    """
-    #    def __get__(self):
-    #        cdef const TokenC* t = &self._seq.data[self.i]
-    #        if t.lemma == 0:
-    #            return self.string
-    #        cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
-    #        return utf8string.decode('utf8')
+    property lemma:
+        """The unicode string of the word's lemma.  If no part-of-speech tag is
+        assigned, the most common part-of-speech tag of the word is used.
+        """
+        def __get__(self):
+            cdef const TokenC* t = &self._seq.data[self.i]
+            if t.lemma == 0:
+                return self.string
+            cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
+            return py_ustr
 
-    #property dep_tag:
-    #    """The ID integer of the word's dependency label.  If no parse has been
-    #    assigned, defaults to 0.
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].dep_tag
+    property dep_tag:
+        """The ID integer of the word's dependency label.  If no parse has been
+        assigned, defaults to 0.
+        """
+        def __get__(self):
+            return self._seq.data[self.i].dep_tag
 
-    #property pos:
-    #    """The ID integer of the word's part-of-speech tag, from the 13-tag
-    #    Google Universal Tag Set.  Constants for this tag set are available in
-    #    spacy.typedefs.
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].pos
+    property pos:
+        """The ID integer of the word's part-of-speech tag, from the 13-tag
+        Google Universal Tag Set.  Constants for this tag set are available in
+        spacy.typedefs.
+        """
+        def __get__(self):
+            return self._seq.data[self.i].pos
 
-    #property fine_pos:
-    #    """The ID integer of the word's fine-grained part-of-speech tag, as assigned
-    #    by the tagger model.  Fine-grained tags include morphological information,
-    #    and other distinctions, and allow a more accurate tagger to be trained.
-    #    """
+    property fine_pos:
+        """The ID integer of the word's fine-grained part-of-speech tag, as assigned
+        by the tagger model.  Fine-grained tags include morphological information,
+        and other distinctions, and allow a more accurate tagger to be trained.
+        """
 
-    #    def __get__(self):
-    #        return self._seq.data[self.i].fine_pos
+        def __get__(self):
+            return self._seq.data[self.i].fine_pos
 
-    #property sic:
-    #    def __get__(self):
-    #        return self._seq.data[self.i].lex.sic
+    property sic:
+        def __get__(self):
+            return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
 
-    #property head:
-    #    """The token predicted by the parser to be the head of the current token."""
-    #    def __get__(self):
-    #        cdef const TokenC* t = &self._seq.data[self.i]
-    #        return Token(self._seq, self.i + t.head)
+    property head:
+        """The token predicted by the parser to be the head of the current token."""
+        def __get__(self):
+            cdef const TokenC* t = &self._seq.data[self.i]
+            return Token(self._seq, self.i + t.head)
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 203d3c7a5..25d62cffe 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -24,12 +24,13 @@ cdef struct _Cached:
 
 
 cdef class Vocab:
-    cpdef public get_lex_props
+    cpdef public lexeme_props_getter
    cdef Pool mem
    cpdef readonly StringStore strings
-    cdef vector[LexemeC*] lexemes
+    cdef vector[const LexemeC*] lexemes
 
    cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
 
    cdef PreshMap _map
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 04943ac33..4043b14e0 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -5,7 +5,7 @@ from os import path
 import codecs
 
 from .lexeme cimport EMPTY_LEXEME
-from .lexeme cimport init as lexeme_init
+from .lexeme cimport set_lex_struct_props
 from .lexeme cimport Lexeme_cinit
 from .strings cimport slice_unicode
 from .strings cimport hash_string
@@ -21,24 +21,6 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 EMPTY_LEXEME.vec = EMPTY_VEC
 
 
-cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
-                         StringStore string_store, dict props) except *:
-    cdef LexemeC lex
-    lex.id = i
-    lex.length = len(string)
-    lex.sic = string_store[string]
-
-    lex.cluster = props.get('cluster', 0)
-    lex.prob = props.get('prob', 0)
-
-    lex.prefix = string_store[string[:1]]
-    lex.suffix = string_store[string[-3:]]
-    lex.shape = string_store[word_shape(string)]
-
-    lex.flags = props.get('flags', 0)
-    return lex
-
-
 cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
@@ -47,7 +29,7 @@ cdef class Vocab:
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
-        self.get_lex_props = get_lex_props
+        self.lexeme_props_getter = get_lex_props
 
         if data_dir is not None:
             if not path.exists(data_dir):
@@ -63,32 +45,36 @@ cdef class Vocab:
         """The current number of lexemes stored."""
         return self.lexemes.size()
 
-    cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL:
+    cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool.  If the pool
         is the lexicon's own memory, the lexeme is saved in the lexicon.'''
         cdef LexemeC* lex
-        lex = self._map.get(string.key)
+        lex = self._map.get(c_str.key)
         if lex != NULL:
             return lex
 
-        if string.n < 3:
+        if c_str.n < 3:
             mem = self.mem
-        cdef unicode py_string = string.chars[:string.n]
+        cdef unicode py_str = c_str.chars[:c_str.n]
         lex = mem.alloc(sizeof(LexemeC), 1)
-        lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
-                             self.get_lex_props(py_string))
+        props = self.lexeme_props_getter(py_str)
+        set_lex_struct_props(lex, props, self.strings)
         if mem is self.mem:
-            self._map.set(string.key, lex)
-            while self.lexemes.size() < (lex.id + 1):
-                self.lexemes.push_back(&EMPTY_LEXEME)
-            self.lexemes[lex.id] = lex
+            lex.id = self.lexemes.size()
+            self._add_lex_to_vocab(c_str.key, lex)
         else:
-            lex[0].id = 1
+            lex.id = 1
         return lex
 
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+        self._map.set(key, lex)
+        while self.lexemes.size() < (lex.id + 1):
+            self.lexemes.push_back(&EMPTY_LEXEME)
+        self.lexemes[lex.id] = lex
+
     def __getitem__(self, id_or_string):
         '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously
-        unseen unicode string is given, a new LexemeC is created and stored.
+        unseen unicode string is given, a new lexeme is created and stored.
 
         Args:
            id_or_string (int or unicode): The integer ID of a word, or its unicode
@@ -100,24 +86,28 @@ cdef class Vocab:
            lexeme (Lexeme): An instance of the Lexeme Python class, with data
                copied on instantiation.
        '''
-        cdef UniStr string
+        cdef UniStr c_str
         cdef const LexemeC* lexeme
         if type(id_or_string) == int:
             if id_or_string >= self.lexemes.size():
                 raise IndexError
             lexeme = self.lexemes.at(id_or_string)
         else:
-            slice_unicode(&string, id_or_string, 0, len(id_or_string))
-            lexeme = self.get(self.mem, &string)
+            slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
+            lexeme = self.get(self.mem, &c_str)
         return Lexeme_cinit(lexeme, self.strings)
 
-    def __setitem__(self, unicode uni_string, dict props):
-        cdef UniStr s
-        slice_unicode(&s, uni_string, 0, len(uni_string))
-        # Cast through the const here, since we're allowed to change our own
-        # LexemeCs.
-        lex = self.get(self.mem, &s)
-        lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
+    def __setitem__(self, unicode py_str, dict props):
+        cdef UniStr c_str
+        slice_unicode(&c_str, py_str, 0, len(py_str))
+        cdef LexemeC* lex
+        lex = self._map.get(c_str.key)
+        if lex == NULL:
+            lex = self.mem.alloc(sizeof(LexemeC), 1)
+            lex.id = self.lexemes.size()
+            self._add_lex_to_vocab(c_str.key, lex)
+        set_lex_struct_props(lex, props, self.strings)
+        assert lex.sic < 1000000
 
     def dump(self, loc):
         if path.exists(loc):
@@ -154,6 +144,7 @@ cdef class Vocab:
             if st != 1:
                 break
             lexeme = self.mem.alloc(sizeof(LexemeC), 1)
+            lexeme.vec = EMPTY_VEC
             st = fread(lexeme, sizeof(LexemeC), 1, fp)
             if st != 1:
                 break
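A note on the get_lex_props / set_lex_struct_props split: lexicon entries are now populated from a plain dict of properties instead of the removed init()/init_lexeme() helpers, and set_lex_struct_props() indexes that dict with props['...'] rather than props.get(), so every key becomes required for any custom props getter passed to Vocab. A rough sketch of the dict the patched spacy/en/__init__.py would produce; the 'flags' value really comes from get_flags() and is shown as 0 only for illustration:

    # Sketch of the props dict built by the patched get_lex_props() for u'Apples'.
    string = u'Apples'
    props = {
        'flags': 0,                  # placeholder; the patch uses get_flags(string)
        'length': len(string),       # 6
        'sic': string,
        'norm1': string, 'norm2': string, 'shape': string,
        'prefix': string[0],         # u'A'
        'suffix': string[-3:],       # u'les'
        'cluster': 0, 'prob': 0, 'sentiment': 0,
    }
    # Vocab.get() hands this dict to set_lex_struct_props(), which interns the
    # string-valued fields through the StringStore and copies the numeric ones
    # straight onto the LexemeC struct.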
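A note on the strings.pyx change: StringStore now decodes on the way out, so looking a string up by integer ID returns unicode rather than UTF-8 bytes, which is why EnPosTagger.lemmatize and the Token.string / Token.lemma properties drop their .decode('utf8') calls. A minimal pure-Python stand-in for the intended round trip (FakeStringStore is illustrative only, not the real Cython class, and it skips interning/deduplication):

    class FakeStringStore:
        """Bytes in -> integer id; integer in -> unicode out."""
        def __init__(self):
            self._strings = [b'']     # id 0 stays reserved, as in the bounds check

        def __getitem__(self, string_or_id):
            if isinstance(string_or_id, bytes):
                self._strings.append(string_or_id)
                return len(self._strings) - 1
            return self._strings[string_or_id].decode('utf8')

    store = FakeStringStore()
    i = store[u'pizza'.encode('utf8')]
    assert store[i] == u'pizza'       # callers no longer decode themselves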
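A note on the tokens.pyx refactor: a Token is no longer a field-by-field copy made by cinit_token(); it is a lightweight view that stores only its parent sequence and an index, and every property dereferences self._seq.data[self.i] on access (the @cython.freelist(64) decorator keeps these small objects cheap to allocate). A minimal pure-Python sketch of the same delegation pattern, with a plain list of dicts standing in for the TokenC array:

    class TokenView:
        """Illustrative stand-in for the refactored Token: a (sequence, index) pair."""
        def __init__(self, seq, i):
            self._seq = seq    # parent sequence (Tokens in the patch)
            self.i = i         # this token's position within it

        @property
        def string(self):
            # Attributes are read through the parent on each access, so creating
            # a view never copies per-token data.
            return self._seq[self.i]['string']

        @property
        def head(self):
            # 'head' is stored as a relative offset, mirroring
            # `return Token(self._seq, self.i + t.head)` in the patch.
            return TokenView(self._seq, self.i + self._seq[self.i]['head'])

    data = [{'string': u'eat', 'head': 0}, {'string': u'apples', 'head': -1}]
    assert TokenView(data, 1).head.string == u'eat'

One consequence of the view design: a Token keeps its Tokens object alive and reflects any attributes (tags, lemmas, heads) assigned after the Token was created, instead of freezing a copy at construction time.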