diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index d22f48ba1..bd5e0f135 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -25,5 +25,8 @@ cdef class StringStore:
     cdef vector[hash_t] keys
     cdef public PreshMap _map
 
-    cdef const Utf8Str* intern_unicode(self, str py_string)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
+    cdef vector[hash_t] _transient_keys
+    cdef PreshMap _transient_map
+    cdef Pool _non_temp_mem
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 376a13175..5e0bd90c6 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,6 +1,10 @@
 # cython: infer_types=True
 # cython: profile=False
 cimport cython
+
+from contextlib import contextmanager
+from typing import Iterator, List, Optional
+
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash32, hash64
@@ -31,7 +35,7 @@ def get_string_id(key):
     This function optimises for convenience over performance,
     so shouldn't be used in tight loops.
     """
-    cdef hash_t str_hash
+    cdef hash_t str_hash
     if isinstance(key, str):
         if len(key) == 0:
             return 0
@@ -45,8 +49,8 @@ def get_string_id(key):
     elif _try_coerce_to_hash(key, &str_hash):
         # Coerce the integral key to the expected primitive hash type.
         # This ensures that custom/overloaded "primitive" data types
-        # such as those implemented by numpy are not inadvertently used
-        # downsteam (as these are internally implemented as custom PyObjects
+        # such as those implemented by numpy are not inadvertently used
+        # downstream (as these are internally implemented as custom PyObjects
         # whose comparison operators can incur a significant overhead).
         return str_hash
     else:
@@ -119,7 +123,9 @@ cdef class StringStore:
         strings (iterable): A sequence of unicode strings to add to the store.
         """
         self.mem = Pool()
+        self._non_temp_mem = self.mem
         self._map = PreshMap()
+        self._transient_map = None
         if strings is not None:
             for string in strings:
                 self.add(string)
@@ -152,10 +158,13 @@ cdef class StringStore:
                 return SYMBOLS_BY_INT[str_hash]
             else:
                 utf8str = <Utf8Str*>self._map.get(str_hash)
+                if utf8str is NULL and self._transient_map is not None:
+                    utf8str = <Utf8Str*>self._transient_map.get(str_hash)
         else:
             # TODO: Raise an error instead
             utf8str = <Utf8Str*>self._map.get(string_or_id)
-
+            if utf8str is NULL and self._transient_map is not None:
+                utf8str = <Utf8Str*>self._transient_map.get(string_or_id)
         if utf8str is NULL:
             raise KeyError(Errors.E018.format(hash_value=string_or_id))
         else:
@@ -175,10 +184,46 @@ cdef class StringStore:
         else:
             return self[key]
 
-    def add(self, string):
+    def __reduce__(self):
+        strings = list(self.non_transient_keys())
+        return (StringStore, (strings,), None, None, None)
+
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self.keys.size() + self._transient_keys.size()
+
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        self.mem = mem
+        self._transient_map = PreshMap()
+        yield mem
+        self.mem = self._non_temp_mem
+        self._transient_map = None
+        self._transient_keys.clear()
+
+    def add(self, string: str, allow_transient: bool = False) -> int:
         """Add a string to the StringStore.
 
         string (str): The string to add.
+        allow_transient (bool): Allow the string to be stored in the 'transient'
+            map, which will be flushed at the end of the memory zone. Strings
+            encountered during arbitrary text processing should be added
+            with allow_transient=True, while labels and other strings used
+            internally should not.
         RETURNS (uint64): The string's hash value.
         """
         cdef hash_t str_hash
@@ -188,22 +233,26 @@ cdef class StringStore:
             string = string.encode("utf8")
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         elif isinstance(string, bytes):
             if string in SYMBOLS_BY_STR:
                 return SYMBOLS_BY_STR[string]
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         else:
             raise TypeError(Errors.E017.format(value_type=type(string)))
         return str_hash
 
-    def __len__(self):
-        """The number of strings in the store.
-
-        RETURNS (int): The number of strings in the store.
-        """
-        return self.keys.size()
-
     def __contains__(self, string_or_id not None):
         """Check whether a string or ID is in the store.
@@ -222,30 +271,70 @@ cdef class StringStore:
             pass
         else:
             # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
-
+            if self._map.get(string_or_id) is not NULL:
+                return True
+            elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL:
+                return True
+            else:
+                return False
         if str_hash < len(SYMBOLS_BY_INT):
             return True
         else:
-            return self._map.get(str_hash) is not NULL
+            if self._map.get(str_hash) is not NULL:
+                return True
+            elif self._transient_map is not None and self._transient_map.get(str_hash) is not NULL:
+                return True
+            else:
+                return False
 
     def __iter__(self):
         """Iterate over the strings in the store, in order.
 
         YIELDS (str): A string in the store.
         """
+        yield from self.non_transient_keys()
+        yield from self.transient_keys()
+
+    def non_transient_keys(self) -> Iterator[str]:
+        """Iterate over the permanently stored strings in insertion order.
+
+        YIELDS (str): A string in the store.
+        """
         cdef int i
         cdef hash_t key
         for i in range(self.keys.size()):
             key = self.keys[i]
             utf8str = <Utf8Str*>self._map.get(key)
             yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
 
-    def __reduce__(self):
-        strings = list(self)
-        return (StringStore, (strings,), None, None, None)
+    def transient_keys(self) -> Iterator[str]:
+        """Iterate over the transient strings in insertion order.
+
+        YIELDS (str): A string in the store.
+        """
+        if self._transient_map is None:
+            return
+        for i in range(self._transient_keys.size()):
+            utf8str = <Utf8Str*>self._transient_map.get(self._transient_keys[i])
+            yield decode_Utf8Str(utf8str)
+
+    def values(self) -> List[int]:
+        """Iterate over the stored string hashes in insertion order.
+
+        RETURNS: A list of string hashes.
+        """
+        cdef int i
+        hashes = [None] * self.keys.size()
+        for i in range(self.keys.size()):
+            hashes[i] = self.keys[i]
+        if self._transient_map is not None:
+            transient_hashes = [None] * self._transient_keys.size()
+            for i in range(self._transient_keys.size()):
+                transient_hashes[i] = self._transient_keys[i]
+        else:
+            transient_hashes = []
+        return hashes + transient_hashes
 
     def to_disk(self, path):
         """Save the current state to a directory.
@@ -269,7 +358,7 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self
 
     def to_bytes(self, **kwargs):
@@ -289,7 +378,7 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self
 
     def _reset_and_load(self, strings):
@@ -297,22 +386,34 @@ cdef class StringStore:
         self._map = PreshMap()
         self.keys.clear()
         for string in strings:
-            self.add(string)
+            self.add(string, allow_transient=False)
 
-    cdef const Utf8Str* intern_unicode(self, str py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+        return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)
 
     @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
         cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
             return value
+        if allow_transient and self._transient_map is not None:
+            # If we've already allocated a transient string, and now we
+            # want to intern it permanently, we'll end up with the string
+            # in both places. That seems fine -- I don't see why we need
+            # to remove it from the transient map.
+            value = <Utf8Str*>self._transient_map.get(key)
+            if value is not NULL:
+                return value
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
-        self._map.set(key, value)
-        self.keys.push_back(key)
+        if allow_transient and self._transient_map is not None:
+            self._transient_map.set(key, value)
+            self._transient_keys.push_back(key)
+        else:
+            self._map.set(key, value)
+            self.keys.push_back(key)
         return value
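Taken together, the `StringStore` changes give interned strings a two-tier lifetime: permanent entries in `_map`/`keys`, and transient entries that live only as long as the enclosing zone. A minimal sketch of the intended semantics (not part of the diff; the string literals are arbitrary):

```python
from spacy.strings import StringStore

store = StringStore()
label_hash = store.add("PERSON", allow_transient=False)  # permanent intern

with store.memory_zone():
    # Inside the zone, transient strings resolve like any other string.
    text_hash = store.add("one-off token text", allow_transient=True)
    assert store[text_hash] == "one-off token text"

# On exit, the transient map and keys are flushed: the permanent string
# survives, while a lookup on the transient hash now raises KeyError.
assert label_hash in store
assert text_hash not in store
```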
+ """ + cdef int i + hashes = [None] * self._keys.size() + for i in range(self._keys.size()): + hashes[i] = self._keys[i] + if self._transient_map is not None: + transient_hashes = [None] * self._transient_keys.size() + for i in range(self._transient_keys.size()): + transient_hashes[i] = self._transient_keys[i] + else: + transient_hashes = [] + return hashes + transient_hashes + def to_disk(self, path): """Save the current state to a directory. @@ -269,7 +358,7 @@ cdef class StringStore: prev = list(self) self._reset_and_load(strings) for word in prev: - self.add(word) + self.add(word, allow_transient=False) return self def to_bytes(self, **kwargs): @@ -289,7 +378,7 @@ cdef class StringStore: prev = list(self) self._reset_and_load(strings) for word in prev: - self.add(word) + self.add(word, allow_transient=False) return self def _reset_and_load(self, strings): @@ -297,22 +386,34 @@ cdef class StringStore: self._map = PreshMap() self.keys.clear() for string in strings: - self.add(string) + self.add(string, allow_transient=False) - cdef const Utf8Str* intern_unicode(self, str py_string): + cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient): # 0 means missing, but we don't bother offsetting the index. cdef bytes byte_string = py_string.encode("utf8") - return self._intern_utf8(byte_string, len(byte_string), NULL) + return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient) @cython.final - cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash): + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient): # TODO: This function's API/behaviour is an unholy mess... # 0 means missing, but we don't bother offsetting the index. cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length) cdef Utf8Str* value = self._map.get(key) if value is not NULL: return value + if allow_transient and self._transient_map is not None: + # If we've already allocated a transient string, and now we + # want to intern it permanently, we'll end up with the string + # in both places. That seems fine -- I don't see why we need + # to remove it from the transient map. 
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index a902ebad9..88e4b06b0 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -25,9 +25,7 @@ cdef class Tokenizer:
     cdef PhraseMatcher _special_matcher
     # TODO convert to bool in v4
     cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef public int max_cache_size
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 96545828f..93b7f63ac 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -30,7 +30,7 @@ cdef class Tokenizer:
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                  suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None, faster_heuristics=True):
+                 url_match=None, faster_heuristics=True, max_cache_size=10000):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
 
         vocab (Vocab): A storage container for lexical types.
@@ -50,6 +50,7 @@ cdef class Tokenizer:
         faster_heuristics (bool): Whether to restrict the final
             Matcher-based pass for rules to those containing affixes or space.
             Defaults to True.
+        max_cache_size (int): Maximum number of tokenization chunks to cache.
 
         EXAMPLE:
            >>> tokenizer = Tokenizer(nlp.vocab)
@@ -69,6 +70,7 @@ cdef class Tokenizer:
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
+        self.max_cache_size = max_cache_size
 
     @property
     def token_match(self):
@@ -397,8 +399,9 @@ cdef class Tokenizer:
                                           has_special, with_special_cases)
             self._attach_tokens(tokens, span, &prefixes, &suffixes,
                                 has_special, with_special_cases)
-            self._save_cached(&tokens.c[orig_size], orig_key, has_special,
-                              tokens.length - orig_size)
+            if len(self._cache) < self.max_cache_size:
+                self._save_cached(&tokens.c[orig_size], orig_key, has_special,
+                                  tokens.length - orig_size)
 
     cdef str _split_affixes(
         self,
@@ -514,6 +517,9 @@ cdef class Tokenizer:
         if n <= 0:
             # avoid mem alloc of zero length
             return 0
+        # Historically this check was mostly used to avoid caching
+        # chunks that had tokens owned by the Doc. Now that that's
+        # not a thing, I don't think we need this?
         for i in range(n):
             if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                 return 0
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 43e47af1d..c2bfe12e3 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -41,7 +41,9 @@ cdef class Vocab:
     cdef const TokenC* make_fused_token(self, substrings) except NULL
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
     cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
 
     cdef PreshMap _by_orth
+    cdef Pool _non_temp_mem
+    cdef vector[attr_t] _transient_orths
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index b7ff20348..ee7636f02 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -1,6 +1,8 @@
+from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
 
+from cymem.cymem import Pool
 from thinc.types import Floats1d, FloatsXd
 
 from . import Language
@@ -67,6 +69,8 @@ class Vocab:
     def from_bytes(
         self, bytes_data: bytes, *, exclude: Iterable[str] = ...
    ) -> Vocab: ...
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
 
 def pickle_vocab(vocab: Vocab) -> Any: ...
 def unpickle_vocab(
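`Vocab.memory_zone` below composes the string store's zone with any sub-component that also exposes one, entering them through `contextlib.ExitStack` because the number of participating components isn't known statically. A standalone sketch of that stdlib pattern, using dummy components rather than spaCy objects:

```python
from contextlib import ExitStack, contextmanager


@contextmanager
def zone(name: str):
    print(f"enter {name}")
    yield
    print(f"exit {name}")


components = ["strings", "morphology", "vectors"]  # illustrative names only

with ExitStack() as stack:
    for name in components:
        stack.enter_context(zone(name))
    print("all zones open")
# ExitStack unwinds the entered contexts in reverse order on exit.
```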
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 19e6eb005..97ba5d68c 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -1,4 +1,6 @@
 import functools
+from contextlib import ExitStack, contextmanager
+from typing import Iterator, Optional
 
 import numpy
 import srsly
@@ -87,6 +89,12 @@ cdef class Vocab:
         self.lookups = lookups
         self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
+        # During a memory_zone we replace our mem object with one
+        # that's passed to us. We keep a reference to our non-temporary
+        # memory here, in case we need to make an allocation we want to
+        # guarantee is not temporary. This is also how we check whether
+        # we're in a memory zone: we check whether self.mem is self._non_temp_mem.
+        self._non_temp_mem = self.mem
 
     @property
     def vectors(self):
@@ -114,6 +122,33 @@ cdef class Vocab:
         """
         return self.length
 
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.strings.memory_zone(mem))]
+            if hasattr(self.morphology, "memory_zone"):
+                contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
+            if hasattr(self._vectors, "memory_zone"):
+                contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
+            self.mem = mem
+            yield mem
+            self._clear_transient_orths()
+            self.mem = self._non_temp_mem
+
     def add_flag(self, flag_getter, int flag_id=-1):
         """Set a new boolean flag to words in the vocabulary.
 
@@ -148,8 +183,7 @@ cdef class Vocab:
 
     cdef const LexemeC* get(self, Pool mem, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool. If the
-        pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary.
         """
         if string == "":
             return &EMPTY_LEXEME
@@ -180,17 +214,9 @@ cdef class Vocab:
         return self._new_lexeme(mem, self.strings[orth])
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
+        # The mem argument is deprecated, replaced by memory zones, as is
+        # the old size heuristic below: lexemes are always allocated from
+        # the vocab's current pool.
         mem = self.mem
-        # if len(string) < 3 or self.length < 10000:
-        #     mem = self.mem
-        cdef bint is_oov = mem is not self.mem
         lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
         lex.orth = self.strings.add(string)
         lex.length = len(string)
@@ -202,18 +228,25 @@ cdef class Vocab:
         for attr, func in self.lex_attr_getters.items():
             value = func(string)
             if isinstance(value, str):
-                value = self.strings.add(value)
+                value = self.strings.add(value, allow_transient=True)
             if value is not None:
                 Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
 
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
         self._by_orth.set(lex.orth, <void*>lex)
         self.length += 1
+        if is_transient:
+            self._transient_orths.push_back(lex.orth)
+
+    def _clear_transient_orths(self):
+        """Remove transient lexemes from the index (generally at the end of the memory zone)."""
+        for orth in self._transient_orths:
+            self._by_orth.pop(orth)
+        self._transient_orths.clear()
 
     def __contains__(self, key):
         """Check whether the string or int key has an entry in the vocabulary.
@@ -265,7 +298,7 @@ cdef class Vocab:
         """
         cdef attr_t orth
         if isinstance(id_or_string, str):
-            orth = self.strings.add(id_or_string)
+            orth = self.strings.add(id_or_string, allow_transient=True)
         else:
             orth = id_or_string
         return Lexeme(self, orth)
@@ -417,7 +450,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#get_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         if self.has_vector(key):
@@ -436,7 +469,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#set_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=False)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         if self.vectors.is_full and key not in self.vectors:
@@ -460,7 +493,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#has_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         return key in self.vectors
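Nothing in this diff wires `memory_zone` into `Language`, but since pipeline components allocate through the shared vocab, a caller can already wrap processing in the vocab's zone. A usage sketch (hypothetical deployment code, not part of the PR); per the docstrings above, using zone-allocated objects after the zone exits is undefined behaviour:

```python
import spacy

nlp = spacy.blank("en")
texts = ["first document", "second document"]  # stand-in for a request stream

with nlp.vocab.memory_zone():
    for doc in nlp.pipe(texts):
        # Consume results inside the zone; keeping Docs (or Lexemes)
        # around after the zone exits would be invalid.
        print(len(doc), doc[0].text)
```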