From 7d48bba6c4392ef58d2e968b012adefb4f0d17e6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Dec 2014 06:42:01 +1100
Subject: [PATCH] * Move StringStore class to its own file

---
 spacy/strings.pxd |  22 +++++++++
 spacy/strings.pyx | 114 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 spacy/strings.pxd
 create mode 100644 spacy/strings.pyx

diff --git a/spacy/strings.pxd b/spacy/strings.pxd
new file mode 100644
index 000000000..2556a1be9
--- /dev/null
+++ b/spacy/strings.pxd
@@ -0,0 +1,22 @@
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+
+from .structs cimport Utf8Str
+
+
+# C-level layout of StringStore; the implementation is in strings.pyx.
+cdef class StringStore:
+    # Pool used for the entry array and for each string's character buffer.
+    cdef Pool mem
+    # Array of interned entries, indexed by id. Slot 0 is never assigned.
+    cdef Utf8Str* strings
+    # Next free slot in `strings` (one more than the number of entries).
+    cdef size_t size
+
+    # Maps the hash64 value of a string to its slot index in `strings`.
+    cdef PreshMap _map
+    # Allocated capacity of `strings`, in entries; doubled when it fills up.
+    cdef size_t _resize_at
+
+    # Return the entry for chars[:length], adding it if it is not stored yet.
+    cdef const Utf8Str* intern(self, char* chars, int length) except NULL
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
new file mode 100644
index 000000000..e6f0b2db0
--- /dev/null
+++ b/spacy/strings.pyx
@@ -0,0 +1,114 @@
+import codecs
+
+from libc.string cimport memcpy
+
+from murmurhash.mrmr cimport hash64
+
+from .typedefs cimport hash_t
+
+
+# Delimiter written between strings by StringStore.dump and split on by load.
+SEPARATOR = '\n|-SEP-|\n'
+
+
+cdef class StringStore:
+    """Two-way table between byte strings and integer ids.
+
+    store[text] interns text (bytes or unicode) and returns its id; store[id]
+    returns the stored bytes. Ids start at 1; 0 is reserved to mean missing.
+    """
+    def __init__(self):
+        self.mem = Pool()
+        self._map = PreshMap()
+        self._resize_at = 10000
+        self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
+        # Slot 0 is skipped so that a map value of 0 (NULL) can mean "absent".
+        self.size = 1
+
+    property size:
+        # Number of interned strings, not counting the unused slot 0.
+        def __get__(self):
+            return self.size-1
+
+    def __getitem__(self, object string_or_id):
+        """Look up by id, or intern a string.
+
+        An int or long returns the bytes stored under that id; ids below 1
+        or above the number of stored strings raise IndexError. A bytes or
+        unicode value (unicode is encoded as UTF-8) is interned and its id
+        is returned. Any other type raises TypeError.
+        """
+        cdef bytes byte_string
+        cdef const Utf8Str* utf8str
+        if isinstance(string_or_id, int) or isinstance(string_or_id, long):
+            if string_or_id < 1 or string_or_id >= self.size:
+                raise IndexError(string_or_id)
+            utf8str = &self.strings[string_or_id]
+            return utf8str.chars[:utf8str.length]
+        elif isinstance(string_or_id, bytes):
+            utf8str = self.intern(string_or_id, len(string_or_id))
+            return utf8str.i
+        elif isinstance(string_or_id, unicode):
+            byte_string = string_or_id.encode('utf8')
+            utf8str = self.intern(byte_string, len(byte_string))
+            return utf8str.i
+        else:
+            raise TypeError(type(string_or_id))
+
+    cdef const Utf8Str* intern(self, char* chars, int length) except NULL:
+        """Return the entry for chars[:length], storing a copy if it is new.
+
+        Entries are found by the hash64 value alone; the bytes themselves
+        are not compared. length must be non-zero.
+        """
+        # 0 means missing, but we don't bother offsetting the index. We waste
+        # slot 0 to simplify the code, because it doesn't matter.
+        assert length != 0
+        cdef hash_t key = hash64(chars, length * sizeof(char), 0)
+        cdef void* value = self._map.get(key)
+        cdef size_t i
+        if value == NULL:
+            # Grow the entry array before writing past its current capacity.
+            if self.size == self._resize_at:
+                self._resize_at *= 2
+                self.strings = <Utf8Str*>self.mem.realloc(self.strings, self._resize_at * sizeof(Utf8Str))
+            i = self.size
+            self.strings[i].i = self.size
+            self.strings[i].key = key
+            self.strings[i].chars = <char*>self.mem.alloc(length, sizeof(char))
+            memcpy(self.strings[i].chars, chars, length)
+            self.strings[i].length = length
+            # The slot index is stored directly in the map's void* value.
+            self._map.set(key, <void*>self.size)
+            self.size += 1
+        else:
+            i = <size_t>value
+        return &self.strings[i]
+
+    def dump(self, loc):
+        """Write every slot, including slot 0, to loc as UTF-8 text.
+
+        Slots are joined with SEPARATOR.
+        """
+        strings = []
+        cdef Utf8Str* string
+        cdef bytes py_string
+        for i in range(self.size):
+            string = &self.strings[i]
+            py_string = string.chars[:string.length]
+            strings.append(py_string.decode('utf8'))
+        with codecs.open(loc, 'w', 'utf8') as file_:
+            file_.write(SEPARATOR.join(strings))
+
+    def load(self, loc):
+        """Intern the strings in a file written by dump.
+
+        The first field (slot 0 in dump's output) is skipped.
+        """
+        with codecs.open(loc, 'r', 'utf8') as file_:
+            strings = file_.read().split(SEPARATOR)
+        cdef unicode string
+        cdef bytes byte_string
+        for string in strings[1:]:
+            byte_string = string.encode('utf8')
+            self.intern(byte_string, len(byte_string))