Update docstrings and API docs for StringStore

This commit is contained in:
ines 2017-05-21 14:18:58 +02:00
parent 251346b59f
commit 2c5cfe8bbf
2 changed files with 166 additions and 69 deletions

View File

@ -11,8 +11,6 @@ from preshed.maps cimport map_iter, key_t
from .typedefs cimport hash_t
from libc.stdint cimport uint32_t
import ujson
cpdef hash_t hash_string(unicode string) except 0:
chars = string.encode('utf8')
@ -72,15 +70,12 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
cdef class StringStore:
"""
Map strings to and from integer IDs.
"""
"""Map strings to and from integer IDs."""
def __init__(self, strings=None, freeze=False):
"""
Create the StringStore.
"""Create the StringStore.
Arguments:
strings: A sequence of unicode strings to add to the store.
strings (iterable): A sequence of unicode strings to add to the store.
RETURNS (StringStore): The newly constructed object.
"""
self.mem = Pool()
self._map = PreshMap()
@ -106,23 +101,17 @@ cdef class StringStore:
return (StringStore, (list(self),))
def __len__(self):
"""
The number of strings in the store.
"""The number of strings in the store.
Returns:
int The number of strings in the store.
RETURNS (int): The number of strings in the store.
"""
return self.size-1
def __getitem__(self, object string_or_id):
"""
Retrieve a string from a given integer ID, or vice versa.
"""Retrieve a string from a given integer ID, or vice versa.
Arguments:
string_or_id (bytes or unicode or int):
The value to encode.
Returns:
unicode or int: The value to retrieved.
string_or_id (bytes or unicode or int): The value to encode.
RETURNS (unicode or int): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0
@ -163,13 +152,10 @@ cdef class StringStore:
return utf8str - self.c
def __contains__(self, unicode string not None):
"""
Check whether a string is in the store.
"""Check whether a string is in the store.
Arguments:
string (unicode): The string to check.
Returns bool:
Whether the store contains the string.
string (unicode): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
if len(string) == 0:
return True
@ -177,10 +163,9 @@ cdef class StringStore:
return self._map.get(key) is not NULL
def __iter__(self):
"""
Iterate over the strings in the store, in order.
"""Iterate over the strings in the store, in order.
Yields: unicode A string in the store.
YIELDS (unicode): A string in the store.
"""
cdef int i
for i in range(self.size):
@ -195,6 +180,41 @@ cdef class StringStore:
strings.append(py_string)
return (StringStore, (strings,), None, None, None)
def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
raise NotImplementedError()
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (StringStore): The modified `StringStore` object.
"""
raise NotImplementedError()
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object.
"""
raise NotImplementedError()
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (StringStore): The `StringStore` object.
"""
raise NotImplementedError()
def set_frozen(self, bint is_frozen):
# TODO
self.is_frozen = is_frozen
@ -235,40 +255,6 @@ cdef class StringStore:
self.size += 1
return &self.c[self.size-1]
def dump(self, file_):
"""
Save the strings to a JSON file.
Arguments:
file_ (buffer): The file to save the strings.
Returns:
None
"""
string_data = ujson.dumps(list(self))
if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8')
# TODO: OOV?
file_.write(string_data)
def load(self, file_):
"""
Load the strings from a JSON file.
Arguments:
file_ (buffer): The file from which to load the strings.
Returns:
None
"""
strings = ujson.load(file_)
if strings == ['']:
return None
cdef unicode string
for string in strings:
# explicit None/len check instead of simple truth testing
# (bug in Cython <= 0.23.4)
if string is not None and len(string):
self.intern_unicode(string)
def _realloc(self):
# We want to map straight to pointers, but they'll be invalidated if
# we resize our array. So, first we remap to indices, then we resize,

View File

@ -7,12 +7,18 @@ p Map strings to and from integer IDs.
+h(2, "init") StringStore.__init__
+tag method
p Create the #[code StringStore].
p
| Create the #[code StringStore]. Note that a newly initialised store will
| always include an empty string #[code ''] at position #[code 0].
+aside-code("Example").
from spacy.strings import StringStore
stringstore = StringStore([u'apple', u'orange'])
+table(["Name", "Type", "Description"])
+row
+cell #[code strings]
+cell -
+cell iterable
+cell A sequence of unicode strings to add to the store.
+footrow
@ -25,6 +31,10 @@ p Create the #[code StringStore].
p Get the number of strings in the store.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
assert len(stringstore) == 2
+table(["Name", "Type", "Description"])
+footrow
+cell returns
@ -36,22 +46,32 @@ p Get the number of strings in the store.
p Retrieve a string from a given integer ID, or vice versa.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
int_id = stringstore[u'apple'] # 1
assert stringstore[int_id] == u'apple'
+table(["Name", "Type", "Description"])
+row
+cell #[code string_or_id]
+cell bytes / unicode / int
+cell bytes, unicode or int
+cell The value to encode.
+footrow
+cell returns
+cell unicode / int
+cell The value to retrieved.
+cell unicode or int
+cell The value to be retrieved.
+h(2, "contains") StringStore.__contains__
+tag method
p Check whether a string is in the store.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
assert u'apple' in stringstore
assert u'cherry' not in stringstore
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
@ -66,10 +86,101 @@ p Check whether a string is in the store.
+h(2, "iter") StringStore.__iter__
+tag method
p Iterate over the strings in the store, in order.
p
| Iterate over the strings in the store, in order. Note that a newly
| initialised store will always include an empty string #[code ''] at
| position #[code 0].
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
all_strings = [s for s in stringstore]
assert all_strings == [u'', u'apple', u'orange']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell unicode
+cell A string in the store.
+h(2, "to_disk") StringStore.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
stringstore.to_disk('/path/to/strings')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") StringStore.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.strings import StringStore
stringstore = StringStore().from_disk('/path/to/strings')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
+cell #[code StringStore]
+cell The modified #[code StringStore] object.
+h(2, "to_bytes") StringStore.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
store_bytes = stringstore.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code StringStore] object.
+h(2, "from_bytes") StringStore.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.strings import StringStore
store_bytes = stringstore.to_bytes()
new_store = StringStore().from_bytes(store_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code StringStore]
+cell The #[code StringStore] object.