diff --git a/spacy/lang.pxd b/spacy/lang.pxd index b03024847..ba9d0a779 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -27,8 +27,6 @@ cdef class Lexicon: cpdef readonly size_t size cpdef readonly StringStore strings - cdef vector[Lexeme*] lexemes - cpdef Lexeme lookup(self, unicode string) cdef Lexeme* get(self, String* s) except NULL diff --git a/spacy/lang.pyx b/spacy/lang.pyx index a09c28172..9323dc052 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -45,8 +45,9 @@ cdef class Language: self.suffix_re = re.compile(suffix) self.infix_re = re.compile(infix) self.lexicon = Lexicon(lexemes) - self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) - self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) + if path.exists(path.join(util.DATA_DIR, name, 'lexemes')): + self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) + self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) self._load_special_tokenization(rules) cpdef Tokens tokenize(self, unicode string): @@ -240,18 +241,16 @@ cdef class Lexicon: for py_string, lexeme_dict in lexemes.iteritems(): string_from_unicode(&string, py_string) lexeme = self.mem.alloc(1, sizeof(Lexeme)) - lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.size, - self.strings, lexeme_dict) - self._dict.set(lexeme.hash, lexeme) - self.lexemes.push_back(lexeme) + lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, + lexeme_dict) + self._dict.set(string.key, lexeme) self.size += 1 def set(self, unicode py_string, dict lexeme_dict): cdef String string string_from_unicode(&string, py_string) cdef Lexeme* lex = self.get(&string) - lex[0] = lexeme_init(string.chars[:string.n], string.key, lex.i, - self.strings, lexeme_dict) + lex[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, lexeme_dict) cdef Lexeme* get(self, String* string) except NULL: cdef Lexeme* lex @@ -259,10 +258,8 @@ cdef class Lexicon: if lex != NULL: return lex lex = self.mem.alloc(sizeof(Lexeme), 1) - lex[0] = lexeme_init(string.chars[:string.n], string.key, self.size, - self.strings, {}) - self._dict.set(lex.hash, lex) - self.lexemes.push_back(lex) + lex[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, {}) + self._dict.set(string.key, lex) self.size += 1 return lex @@ -287,8 +284,15 @@ cdef class Lexicon: cdef FILE* fp = fopen(bytes_loc, 'wb') assert fp != NULL cdef size_t st - for i in range(self.size-1): - st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp) + cdef hash_t key + for i in range(self._dict.length): + key = self._dict.c_map.cells[i].key + if key == 0: + continue + lexeme = self._dict.c_map.cells[i].value + st = fwrite(&key, sizeof(key), 1, fp) + assert st == 1 + st = fwrite(lexeme, sizeof(Lexeme), 1, fp) assert st == 1 st = fclose(fp) assert st == 0 @@ -300,14 +304,17 @@ cdef class Lexicon: assert fp != NULL cdef size_t st cdef Lexeme* lexeme + cdef hash_t key i = 0 while True: + st = fread(&key, sizeof(key), 1, fp) + if st != 1: + break lexeme = self.mem.alloc(sizeof(Lexeme), 1) st = fread(lexeme, sizeof(Lexeme), 1, fp) if st != 1: break - self.lexemes.push_back(lexeme) - self._dict.set(lexeme.hash, lexeme) + self._dict.set(key, lexeme) i += 1 print "Load %d lexemes" % i fclose(fp) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 235883e2a..b39a32522 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -23,8 +23,6 @@ cpdef enum: cdef struct Lexeme: - hash_t hash - atom_t i atom_t length atom_t sic @@ -46,7 +44,7 @@ cdef struct Lexeme: cdef Lexeme EMPTY_LEXEME -cpdef Lexeme init(unicode string, hash_t hashed, atom_t i, +cpdef Lexeme init(unicode string, hash_t hashed, StringStore store, dict props) except * diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 887210225..6760b3913 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -26,11 +26,9 @@ def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc): return flags -cpdef Lexeme init(unicode string, hash_t hashed, atom_t i, +cpdef Lexeme init(unicode string, hash_t hashed, StringStore store, dict props) except *: cdef Lexeme lex - lex.hash = hashed - lex.i = i lex.length = len(string) lex.sic = get_string_id(string, store) diff --git a/spacy/pos.pyx b/spacy/pos.pyx index 8722a1639..bd366c4c3 100644 --- a/spacy/pos.pyx +++ b/spacy/pos.pyx @@ -128,7 +128,7 @@ cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1 cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil: - atoms[0] = lex.i + atoms[0] = lex.sic atoms[1] = lex.cluster atoms[2] = lex.norm atoms[3] = lex.shape diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index d6b655074..616353e8f 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -31,7 +31,6 @@ cdef class Token: cdef public int idx cdef public int pos - cdef public atom_t id cdef public atom_t cluster cdef public atom_t length cdef public atom_t lex_pos diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 6abfd5b6a..f1a96d2ae 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -107,7 +107,6 @@ cdef class Token: self.idx = idx self.pos = pos - self.id = lex['i'] self.cluster = lex['cluster'] self.length = lex['length'] self.lex_pos = lex['pos']