Fix allocation of non-transient strings in StringStore

This commit is contained in:
Matthew Honnibal 2024-12-10 18:37:04 +01:00
parent 628c973db5
commit ec6fbbfe34
2 changed files with 13 additions and 4 deletions

View File

@ -57,16 +57,20 @@ cdef class Morphology:
field_feature_pairs = [] field_feature_pairs = []
for field in sorted(string_features): for field in sorted(string_features):
values = string_features[field] values = string_features[field]
self.strings.add(field, allow_transient=False),
field_id = self.strings[field]
for value in values.split(self.VALUE_SEP): for value in values.split(self.VALUE_SEP):
field_sep_value = field + self.FIELD_SEP + value
self.strings.add(field_sep_value, allow_transient=False),
field_feature_pairs.append(( field_feature_pairs.append((
self.strings.add(field), field_id,
self.strings.add(field + self.FIELD_SEP + value), self.strings[field_sep_value]
)) ))
cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs) cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
# the hash key for the tag is either the hash of the normalized UFEATS # the hash key for the tag is either the hash of the normalized UFEATS
# string or the hash of an empty placeholder # string or the hash of an empty placeholder
norm_feats_string = self.normalize_features(features) norm_feats_string = self.normalize_features(features)
tag.key = self.strings.add(norm_feats_string) tag.key = self.strings.add(norm_feats_string, allow_transient=False)
self.insert(tag) self.insert(tag)
return tag.key return tag.key

View File

@ -222,6 +222,8 @@ cdef class StringStore:
internally should not. internally should not.
RETURNS (uint64): The string's hash value. RETURNS (uint64): The string's hash value.
""" """
if not string:
return 0
if allow_transient is None: if allow_transient is None:
allow_transient = self.mem is not self._non_temp_mem allow_transient = self.mem is not self._non_temp_mem
cdef hash_t str_hash cdef hash_t str_hash
@ -383,7 +385,10 @@ cdef class StringStore:
cdef Utf8Str* value = <Utf8Str*>self._map.get(key) cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
if value is not NULL: if value is not NULL:
return value return value
if allow_transient:
value = _allocate(self.mem, <unsigned char*>utf8_string, length) value = _allocate(self.mem, <unsigned char*>utf8_string, length)
else:
value = _allocate(self._non_temp_mem, <unsigned char*>utf8_string, length)
self._map.set(key, value) self._map.set(key, value)
if allow_transient and self.mem is not self._non_temp_mem: if allow_transient and self.mem is not self._non_temp_mem:
self._transient_keys.push_back(key) self._transient_keys.push_back(key)