diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 38afd7f02..e993f1423 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -11,8 +11,6 @@ from preshed.maps cimport map_iter, key_t from .typedefs cimport hash_t from libc.stdint cimport uint32_t -import ujson - cpdef hash_t hash_string(unicode string) except 0: chars = string.encode('utf8') @@ -72,15 +70,12 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex cdef class StringStore: - """ - Map strings to and from integer IDs. - """ + """Map strings to and from integer IDs.""" def __init__(self, strings=None, freeze=False): - """ - Create the StringStore. + """Create the StringStore. - Arguments: - strings: A sequence of unicode strings to add to the store. + strings (iterable): A sequence of unicode strings to add to the store. + RETURNS (StringStore): The newly constructed object. """ self.mem = Pool() self._map = PreshMap() @@ -106,23 +101,17 @@ cdef class StringStore: return (StringStore, (list(self),)) def __len__(self): - """ - The number of strings in the store. + """The number of strings in the store. - Returns: - int The number of strings in the store. + RETURNS (int): The number of strings in the store. """ return self.size-1 def __getitem__(self, object string_or_id): - """ - Retrieve a string from a given integer ID, or vice versa. + """Retrieve a string from a given integer ID, or vice versa. - Arguments: - string_or_id (bytes or unicode or int): - The value to encode. - Returns: - unicode or int: The value to retrieved. + string_or_id (bytes or unicode or int): The value to encode. + RETURNS (unicode or int): The value to be retrieved. """ if isinstance(string_or_id, basestring) and len(string_or_id) == 0: return 0 @@ -163,13 +152,10 @@ cdef class StringStore: return utf8str - self.c def __contains__(self, unicode string not None): - """ - Check whether a string is in the store. + """Check whether a string is in the store.
- Arguments: - string (unicode): The string to check. - Returns bool: - Whether the store contains the string. + string (unicode): The string to check. + RETURNS (bool): Whether the store contains the string. """ if len(string) == 0: return True @@ -177,10 +163,9 @@ cdef class StringStore: return self._map.get(key) is not NULL def __iter__(self): - """ - Iterate over the strings in the store, in order. + """Iterate over the strings in the store, in order. - Yields: unicode A string in the store. + YIELDS (unicode): A string in the store. """ cdef int i for i in range(self.size): @@ -195,6 +180,41 @@ cdef class StringStore: strings.append(py_string) return (StringStore, (strings,), None, None, None) + def to_disk(self, path): + """Save the current state to a directory. + + path (unicode or Path): A path to a directory, which will be created if + it doesn't exist. Paths may be either strings or `Path`-like objects. + """ + raise NotImplementedError() + + def from_disk(self, path): + """Loads state from a directory. Modifies the object in place and + returns it. + + path (unicode or Path): A path to a directory. Paths may be either + strings or `Path`-like objects. + RETURNS (StringStore): The modified `StringStore` object. + """ + raise NotImplementedError() + + def to_bytes(self, **exclude): + """Serialize the current state to a binary string. + + **exclude: Named attributes to prevent from being serialized. + RETURNS (bytes): The serialized form of the `StringStore` object. + """ + raise NotImplementedError() + + def from_bytes(self, bytes_data, **exclude): + """Load state from a binary string. + + bytes_data (bytes): The data to load from. + **exclude: Named attributes to prevent from being loaded. + RETURNS (StringStore): The `StringStore` object. 
+ """ + raise NotImplementedError() + def set_frozen(self, bint is_frozen): # TODO self.is_frozen = is_frozen @@ -235,40 +255,6 @@ cdef class StringStore: self.size += 1 return &self.c[self.size-1] - def dump(self, file_): - """ - Save the strings to a JSON file. - - Arguments: - file_ (buffer): The file to save the strings. - Returns: - None - """ - string_data = ujson.dumps(list(self)) - if not isinstance(string_data, unicode): - string_data = string_data.decode('utf8') - # TODO: OOV? - file_.write(string_data) - - def load(self, file_): - """ - Load the strings from a JSON file. - - Arguments: - file_ (buffer): The file from which to load the strings. - Returns: - None - """ - strings = ujson.load(file_) - if strings == ['']: - return None - cdef unicode string - for string in strings: - # explicit None/len check instead of simple truth testing - # (bug in Cython <= 0.23.4) - if string is not None and len(string): - self.intern_unicode(string) - def _realloc(self): # We want to map straight to pointers, but they'll be invalidated if # we resize our array. So, first we remap to indices, then we resize, diff --git a/website/docs/api/stringstore.jade b/website/docs/api/stringstore.jade index 8158a2ef7..5f5912edd 100644 --- a/website/docs/api/stringstore.jade +++ b/website/docs/api/stringstore.jade @@ -7,12 +7,18 @@ p Map strings to and from integer IDs. +h(2, "init") StringStore.__init__ +tag method -p Create the #[code StringStore]. +p + | Create the #[code StringStore]. Note that a newly initialised store will + | always include an empty string #[code ''] at position #[code 0]. + ++aside-code("Example"). + from spacy.strings import StringStore + stringstore = StringStore([u'apple', u'orange']) +table(["Name", "Type", "Description"]) +row +cell #[code strings] - +cell - + +cell iterable +cell A sequence of unicode strings to add to the store. +footrow @@ -25,6 +31,10 @@ p Create the #[code StringStore]. p Get the number of strings in the store. 
++aside-code("Example"). + stringstore = StringStore([u'apple', u'orange']) + assert len(stringstore) == 2 + +table(["Name", "Type", "Description"]) +footrow +cell returns @@ -36,22 +46,32 @@ p Get the number of strings in the store. p Retrieve a string from a given integer ID, or vice versa. ++aside-code("Example"). + stringstore = StringStore([u'apple', u'orange']) + int_id = stringstore[u'apple'] # 1 + assert stringstore[int_id] == u'apple' + +table(["Name", "Type", "Description"]) +row +cell #[code string_or_id] - +cell bytes / unicode / int + +cell bytes, unicode or int +cell The value to encode. +footrow +cell returns - +cell unicode / int - +cell The value to retrieved. + +cell unicode or int + +cell The value to be retrieved. +h(2, "contains") StringStore.__contains__ +tag method p Check whether a string is in the store. ++aside-code("Example"). + stringstore = StringStore([u'apple', u'orange']) + assert u'apple' in stringstore + assert u'cherry' not in stringstore + +table(["Name", "Type", "Description"]) +row +cell #[code string] @@ -66,10 +86,101 @@ p Check whether a string is in the store. +h(2, "iter") StringStore.__iter__ +tag method -p Iterate over the strings in the store, in order. +p + | Iterate over the strings in the store, in order. Note that a newly + | initialised store will always include an empty string #[code ''] at + | position #[code 0]. + ++aside-code("Example"). + stringstore = StringStore([u'apple', u'orange']) + all_strings = [s for s in stringstore] + assert all_strings == [u'', u'apple', u'orange'] +table(["Name", "Type", "Description"]) +footrow +cell yields +cell unicode +cell A string in the store. + ++h(2, "to_disk") StringStore.to_disk + +tag method + +p Save the current state to a directory. + ++aside-code("Example").
+ stringstore.to_disk('/path/to/strings') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory, which will be created if it doesn't exist. + | Paths may be either strings or #[code Path]-like objects. + ++h(2, "from_disk") StringStore.from_disk + +tag method + +p Loads state from a directory. Modifies the object in place and returns it. + ++aside-code("Example"). + from spacy.strings import StringStore + stringstore = StringStore().from_disk('/path/to/strings') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory. Paths may be either strings or + | #[code Path]-like objects. + + +footrow + +cell returns + +cell #[code StringStore] + +cell The modified #[code StringStore] object. + ++h(2, "to_bytes") StringStore.to_bytes + +tag method + +p Serialize the current state to a binary string. + ++aside-code("Example"). + store_bytes = stringstore.to_bytes() + ++table(["Name", "Type", "Description"]) + +row + +cell #[code **exclude] + +cell - + +cell Named attributes to prevent from being serialized. + + +footrow + +cell returns + +cell bytes + +cell The serialized form of the #[code StringStore] object. + ++h(2, "from_bytes") StringStore.from_bytes + +tag method + +p Load state from a binary string. + ++aside-code("Example"). + from spacy.strings import StringStore + store_bytes = stringstore.to_bytes() + new_store = StringStore().from_bytes(store_bytes) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code bytes_data] + +cell bytes + +cell The data to load from. + + +row + +cell #[code **exclude] + +cell - + +cell Named attributes to prevent from being loaded. + + +footrow + +cell returns + +cell #[code StringStore] + +cell The #[code StringStore] object.