From d1cb30dbc439f9ce54967f32ca2726d11826be71 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 16 Jul 2015 19:29:02 +0200 Subject: [PATCH] * Remove unnecessary key and i (id) fields from the Utf8Str struct. --- spacy/en/pos.pyx | 3 +-- spacy/strings.pxd | 2 +- spacy/strings.pyx | 15 ++++++++------- spacy/structs.pxd | 2 -- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index de795c1f3..1f09f364a 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -332,8 +332,7 @@ cdef class EnPosTagger: cdef unicode lemma_string lemma_strings = self.lemmatizer(py_string, pos) lemma_string = sorted(lemma_strings)[0] - bytes_string = lemma_string.encode('utf8') - lemma = self.strings.intern(bytes_string, len(bytes_string)).i + lemma = self.strings[lemma_string] return lemma def load_morph_exceptions(self, dict exc): diff --git a/spacy/strings.pxd b/spacy/strings.pxd index 178ae51b6..c86d33d1c 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -22,4 +22,4 @@ cdef class StringStore: cdef PreshMap _map cdef size_t _resize_at - cdef const Utf8Str* intern(self, char* chars, int length) except NULL + cdef const Utf8Str* intern(self, char* chars, int length, int* id_) except NULL diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 56df4d2f1..ca74f4044 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -67,6 +67,7 @@ cdef class StringStore: def __getitem__(self, object string_or_id): cdef bytes byte_string cdef const Utf8Str* utf8str + cdef int id_ if isinstance(string_or_id, int) or isinstance(string_or_id, long): if string_or_id == 0: return u'' @@ -75,16 +76,16 @@ cdef class StringStore: utf8str = &self.strings[string_or_id] return utf8str.chars[:utf8str.length].decode('utf8') elif isinstance(string_or_id, bytes): - utf8str = self.intern(string_or_id, len(string_or_id)) - return utf8str.i + utf8str = self.intern(string_or_id, len(string_or_id), &id_) + return id_ elif isinstance(string_or_id, unicode): 
byte_string = string_or_id.encode('utf8') - utf8str = self.intern(byte_string, len(byte_string)) - return utf8str.i + utf8str = self.intern(byte_string, len(byte_string), &id_) + return id_ else: raise TypeError(type(string_or_id)) - cdef const Utf8Str* intern(self, char* chars, int length) except NULL: + cdef const Utf8Str* intern(self, char* chars, int length, int* id_) except NULL: # 0 means missing, but we don't bother offsetting the index. We waste # slot 0 to simplify the code, because it doesn't matter. assert length != 0 @@ -97,7 +98,6 @@ cdef class StringStore: self.strings = self.mem.realloc(self.strings, self._resize_at * sizeof(Utf8Str)) i = self.size self.strings[i].i = self.size - self.strings[i].key = key self.strings[i].chars = self.mem.alloc(length, sizeof(char)) memcpy(self.strings[i].chars, chars, length) self.strings[i].length = length @@ -123,6 +123,7 @@ cdef class StringStore: strings = file_.read().split(SEPARATOR) cdef unicode string cdef bytes byte_string + cdef int id_ for string in strings[1:]: byte_string = string.encode('utf8') - self.intern(byte_string, len(byte_string)) + self.intern(byte_string, len(byte_string), &id_) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index a26c87e2f..0e0fb34b8 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -80,8 +80,6 @@ cdef struct TokenC: cdef struct Utf8Str: - id_t i - hash_t key unsigned char* chars int length