From ec6fbbfe3449b8e1520c85b13fcce0d8ee30da95 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 10 Dec 2024 18:37:04 +0100 Subject: [PATCH] Fix allocation of non-transient strings in StringStore --- spacy/morphology.pyx | 10 +++++++--- spacy/strings.pyx | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index cef45b04d..6f0cb03f0 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -57,16 +57,20 @@ cdef class Morphology: field_feature_pairs = [] for field in sorted(string_features): values = string_features[field] + self.strings.add(field, allow_transient=False), + field_id = self.strings[field] for value in values.split(self.VALUE_SEP): + field_sep_value = field + self.FIELD_SEP + value + self.strings.add(field_sep_value, allow_transient=False), field_feature_pairs.append(( - self.strings.add(field), - self.strings.add(field + self.FIELD_SEP + value), + field_id, + self.strings[field_sep_value] )) cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs) # the hash key for the tag is either the hash of the normalized UFEATS # string or the hash of an empty placeholder norm_feats_string = self.normalize_features(features) - tag.key = self.strings.add(norm_feats_string) + tag.key = self.strings.add(norm_feats_string, allow_transient=False) self.insert(tag) return tag.key diff --git a/spacy/strings.pyx b/spacy/strings.pyx index defb6d6f4..65e851cae 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -222,6 +222,8 @@ cdef class StringStore: internally should not. RETURNS (uint64): The string's hash value. """ + if not string: + return 0 if allow_transient is None: allow_transient = self.mem is not self._non_temp_mem cdef hash_t str_hash @@ -383,7 +385,10 @@ cdef class StringStore: cdef Utf8Str* value = self._map.get(key) if value is not NULL: return value - value = _allocate(self.mem, utf8_string, length) + if allow_transient: + value = _allocate(self.mem, utf8_string, length) + else: + value = _allocate(self._non_temp_mem, utf8_string, length) self._map.set(key, value) if allow_transient and self.mem is not self._non_temp_mem: self._transient_keys.push_back(key)