mirror of https://github.com/explosion/spaCy.git
Update docstrings and API docs for StringStore
This commit is contained in:
parent
251346b59f
commit
2c5cfe8bbf
|
@ -11,8 +11,6 @@ from preshed.maps cimport map_iter, key_t
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
|
|
||||||
import ujson
|
|
||||||
|
|
||||||
|
|
||||||
cpdef hash_t hash_string(unicode string) except 0:
|
cpdef hash_t hash_string(unicode string) except 0:
|
||||||
chars = string.encode('utf8')
|
chars = string.encode('utf8')
|
||||||
|
@ -72,15 +70,12 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
|
||||||
|
|
||||||
|
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
"""
|
"""Map strings to and from integer IDs."""
|
||||||
Map strings to and from integer IDs.
|
|
||||||
"""
|
|
||||||
def __init__(self, strings=None, freeze=False):
|
def __init__(self, strings=None, freeze=False):
|
||||||
"""
|
"""Create the StringStore.
|
||||||
Create the StringStore.
|
|
||||||
|
|
||||||
Arguments:
|
strings (iterable): A sequence of unicode strings to add to the store.
|
||||||
strings: A sequence of unicode strings to add to the store.
|
RETURNS (StringStore): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._map = PreshMap()
|
self._map = PreshMap()
|
||||||
|
@ -106,23 +101,17 @@ cdef class StringStore:
|
||||||
return (StringStore, (list(self),))
|
return (StringStore, (list(self),))
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""
|
"""The number of strings in the store.
|
||||||
The number of strings in the store.
|
|
||||||
|
|
||||||
Returns:
|
RETURNS (int): The number of strings in the store.
|
||||||
int The number of strings in the store.
|
|
||||||
"""
|
"""
|
||||||
return self.size-1
|
return self.size-1
|
||||||
|
|
||||||
def __getitem__(self, object string_or_id):
|
def __getitem__(self, object string_or_id):
|
||||||
"""
|
"""Retrieve a string from a given integer ID, or vice versa.
|
||||||
Retrieve a string from a given integer ID, or vice versa.
|
|
||||||
|
|
||||||
Arguments:
|
string_or_id (bytes or unicode or int): The value to encode.
|
||||||
string_or_id (bytes or unicode or int):
|
RETURNS (unicode or int): The value to be retrieved.
|
||||||
The value to encode.
|
|
||||||
Returns:
|
|
||||||
unicode or int: The value to retrieved.
|
|
||||||
"""
|
"""
|
||||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
||||||
return 0
|
return 0
|
||||||
|
@ -163,13 +152,10 @@ cdef class StringStore:
|
||||||
return utf8str - self.c
|
return utf8str - self.c
|
||||||
|
|
||||||
def __contains__(self, unicode string not None):
|
def __contains__(self, unicode string not None):
|
||||||
"""
|
"""Check whether a string is in the store.
|
||||||
Check whether a string is in the store.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
string (unicode): The string to check.
|
string (unicode): The string to check.
|
||||||
Returns bool:
|
RETURNS (bool): Whether the store contains the string.
|
||||||
Whether the store contains the string.
|
|
||||||
"""
|
"""
|
||||||
if len(string) == 0:
|
if len(string) == 0:
|
||||||
return True
|
return True
|
||||||
|
@ -177,10 +163,9 @@ cdef class StringStore:
|
||||||
return self._map.get(key) is not NULL
|
return self._map.get(key) is not NULL
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""
|
"""Iterate over the strings in the store, in order.
|
||||||
Iterate over the strings in the store, in order.
|
|
||||||
|
|
||||||
Yields: unicode A string in the store.
|
YIELDS (unicode): A string in the store.
|
||||||
"""
|
"""
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.size):
|
for i in range(self.size):
|
||||||
|
@ -195,6 +180,41 @@ cdef class StringStore:
|
||||||
strings.append(py_string)
|
strings.append(py_string)
|
||||||
return (StringStore, (strings,), None, None, None)
|
return (StringStore, (strings,), None, None, None)
|
||||||
|
|
||||||
|
def to_disk(self, path):
|
||||||
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
|
path (unicode or Path): A path to a directory, which will be created if
|
||||||
|
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def from_disk(self, path):
|
||||||
|
"""Load state from a directory. Modifies the object in place and
|
||||||
|
returns it.
|
||||||
|
|
||||||
|
path (unicode or Path): A path to a directory. Paths may be either
|
||||||
|
strings or `Path`-like objects.
|
||||||
|
RETURNS (StringStore): The modified `StringStore` object.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def to_bytes(self, **exclude):
|
||||||
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
|
**exclude: Named attributes to prevent from being serialized.
|
||||||
|
RETURNS (bytes): The serialized form of the `StringStore` object.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
|
"""Load state from a binary string.
|
||||||
|
|
||||||
|
bytes_data (bytes): The data to load from.
|
||||||
|
**exclude: Named attributes to prevent from being loaded.
|
||||||
|
RETURNS (StringStore): The `StringStore` object.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
def set_frozen(self, bint is_frozen):
|
def set_frozen(self, bint is_frozen):
|
||||||
# TODO
|
# TODO
|
||||||
self.is_frozen = is_frozen
|
self.is_frozen = is_frozen
|
||||||
|
@ -235,40 +255,6 @@ cdef class StringStore:
|
||||||
self.size += 1
|
self.size += 1
|
||||||
return &self.c[self.size-1]
|
return &self.c[self.size-1]
|
||||||
|
|
||||||
def dump(self, file_):
|
|
||||||
"""
|
|
||||||
Save the strings to a JSON file.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
file_ (buffer): The file to save the strings.
|
|
||||||
Returns:
|
|
||||||
None
|
|
||||||
"""
|
|
||||||
string_data = ujson.dumps(list(self))
|
|
||||||
if not isinstance(string_data, unicode):
|
|
||||||
string_data = string_data.decode('utf8')
|
|
||||||
# TODO: OOV?
|
|
||||||
file_.write(string_data)
|
|
||||||
|
|
||||||
def load(self, file_):
|
|
||||||
"""
|
|
||||||
Load the strings from a JSON file.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
file_ (buffer): The file from which to load the strings.
|
|
||||||
Returns:
|
|
||||||
None
|
|
||||||
"""
|
|
||||||
strings = ujson.load(file_)
|
|
||||||
if strings == ['']:
|
|
||||||
return None
|
|
||||||
cdef unicode string
|
|
||||||
for string in strings:
|
|
||||||
# explicit None/len check instead of simple truth testing
|
|
||||||
# (bug in Cython <= 0.23.4)
|
|
||||||
if string is not None and len(string):
|
|
||||||
self.intern_unicode(string)
|
|
||||||
|
|
||||||
def _realloc(self):
|
def _realloc(self):
|
||||||
# We want to map straight to pointers, but they'll be invalidated if
|
# We want to map straight to pointers, but they'll be invalidated if
|
||||||
# we resize our array. So, first we remap to indices, then we resize,
|
# we resize our array. So, first we remap to indices, then we resize,
|
||||||
|
|
|
@ -7,12 +7,18 @@ p Map strings to and from integer IDs.
|
||||||
+h(2, "init") StringStore.__init__
|
+h(2, "init") StringStore.__init__
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Create the #[code StringStore].
|
p
|
||||||
|
| Create the #[code StringStore]. Note that a newly initialised store will
|
||||||
|
| always include an empty string #[code ''] at position #[code 0].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.strings import StringStore
|
||||||
|
stringstore = StringStore([u'apple', u'orange'])
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code strings]
|
+cell #[code strings]
|
||||||
+cell -
|
+cell iterable
|
||||||
+cell A sequence of unicode strings to add to the store.
|
+cell A sequence of unicode strings to add to the store.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
|
@ -25,6 +31,10 @@ p Create the #[code StringStore].
|
||||||
|
|
||||||
p Get the number of strings in the store.
|
p Get the number of strings in the store.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
stringstore = StringStore([u'apple', u'orange'])
|
||||||
|
assert len(stringstore) == 2
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
|
@ -36,22 +46,32 @@ p Get the number of strings in the store.
|
||||||
|
|
||||||
p Retrieve a string from a given integer ID, or vice versa.
|
p Retrieve a string from a given integer ID, or vice versa.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
stringstore = StringStore([u'apple', u'orange'])
|
||||||
|
int_id = stringstore[u'apple'] # 1
|
||||||
|
assert stringstore[int_id] == u'apple'
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code string_or_id]
|
+cell #[code string_or_id]
|
||||||
+cell bytes / unicode / int
|
+cell bytes, unicode or int
|
||||||
+cell The value to encode.
|
+cell The value to encode.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell unicode / int
|
+cell unicode or int
|
||||||
+cell The value to retrieved.
|
+cell The value to be retrieved.
|
||||||
|
|
||||||
+h(2, "contains") StringStore.__contains__
|
+h(2, "contains") StringStore.__contains__
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Check whether a string is in the store.
|
p Check whether a string is in the store.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
stringstore = StringStore([u'apple', u'orange'])
|
||||||
|
assert u'apple' in stringstore
|
||||||
|
assert u'cherry' not in stringstore
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code string]
|
+cell #[code string]
|
||||||
|
@ -66,10 +86,101 @@ p Check whether a string is in the store.
|
||||||
+h(2, "iter") StringStore.__iter__
|
+h(2, "iter") StringStore.__iter__
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Iterate over the strings in the store, in order.
|
p
|
||||||
|
| Iterate over the strings in the store, in order. Note that a newly
|
||||||
|
| initialised store will always include an empty string #[code ''] at
|
||||||
|
| position #[code 0].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
stringstore = StringStore([u'apple', u'orange'])
|
||||||
|
all_strings = [s for s in stringstore]
|
||||||
|
assert all_strings == [u'', u'apple', u'orange']
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+footrow
|
+footrow
|
||||||
+cell yields
|
+cell yields
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell A string in the store.
|
+cell A string in the store.
|
||||||
|
|
||||||
|
+h(2, "to_disk") StringStore.to_disk
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Save the current state to a directory.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
stringstore.to_disk('/path/to/strings')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code path]
|
||||||
|
+cell unicode or #[code Path]
|
||||||
|
+cell
|
||||||
|
| A path to a directory, which will be created if it doesn't exist.
|
||||||
|
| Paths may be either strings or #[code Path]-like objects.
|
||||||
|
|
||||||
|
+h(2, "from_disk") StringStore.from_disk
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Load state from a directory. Modifies the object in place and returns it.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.strings import StringStore
|
||||||
|
stringstore = StringStore().from_disk('/path/to/strings')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code path]
|
||||||
|
+cell unicode or #[code Path]
|
||||||
|
+cell
|
||||||
|
| A path to a directory. Paths may be either strings or
|
||||||
|
| #[code Path]-like objects.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell #[code StringStore]
|
||||||
|
+cell The modified #[code StringStore] object.
|
||||||
|
|
||||||
|
+h(2, "to_bytes") StringStore.to_bytes
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Serialize the current state to a binary string.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
store_bytes = stringstore.to_bytes()
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code **exclude]
|
||||||
|
+cell -
|
||||||
|
+cell Named attributes to prevent from being serialized.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell bytes
|
||||||
|
+cell The serialized form of the #[code StringStore] object.
|
||||||
|
|
||||||
|
+h(2, "from_bytes") StringStore.from_bytes
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Load state from a binary string.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.strings import StringStore
|
||||||
|
store_bytes = stringstore.to_bytes()
|
||||||
|
new_store = StringStore().from_bytes(store_bytes)
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code bytes_data]
|
||||||
|
+cell bytes
|
||||||
|
+cell The data to load from.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **exclude]
|
||||||
|
+cell -
|
||||||
|
+cell Named attributes to prevent from being loaded.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell #[code StringStore]
|
||||||
|
+cell The #[code StringStore] object.
|
||||||
|
|
Loading…
Reference in New Issue