Add get_string_id helper to spacy.strings

This commit is contained in:
Matthew Honnibal 2018-12-10 16:09:26 +01:00
parent cc1ea03004
commit 16fd8dce1d
1 changed file with 19 additions and 0 deletions

View File

@ -9,6 +9,7 @@ from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32 from murmurhash.mrmr cimport hash64, hash32
import srsly import srsly
from .compat import basestring_
from .symbols import IDS as SYMBOLS_BY_STR from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t from .typedefs cimport hash_t
@ -16,6 +17,24 @@ from .errors import Errors
from . import util from . import util
def get_string_id(key):
    """Resolve a key to its string ID, respecting the reserved symbols.

    A key that is not a string is taken to be an ID already and is returned
    unchanged. A string found in the reserved symbols table yields its
    symbol ID, the empty string yields 0, and any other string is encoded as
    UTF-8 and hashed.

    This helper favours convenience over performance, so it should not be
    called in tight loops.
    """
    # Non-string keys are passed straight through as existing IDs.
    if not isinstance(key, basestring_):
        return key
    # Reserved symbols keep their fixed IDs instead of being hashed.
    if key in SYMBOLS_BY_STR:
        return SYMBOLS_BY_STR[key]
    # The empty string is only checked after the symbols lookup.
    if not key:
        return 0
    utf8_bytes = key.encode('utf8')
    return hash_utf8(utf8_bytes, len(utf8_bytes))
cpdef hash_t hash_string(unicode string) except 0: cpdef hash_t hash_string(unicode string) except 0:
chars = string.encode('utf8') chars = string.encode('utf8')
return hash_utf8(chars, len(chars)) return hash_utf8(chars, len(chars))