mirror of https://github.com/explosion/spaCy.git
Add get_string_id helper to spacy.strings
This commit is contained in:
parent
cc1ea03004
commit
16fd8dce1d
|
@ -9,6 +9,7 @@ from libc.stdint cimport uint32_t
|
||||||
from murmurhash.mrmr cimport hash64, hash32
|
from murmurhash.mrmr cimport hash64, hash32
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
|
from .compat import basestring_
|
||||||
from .symbols import IDS as SYMBOLS_BY_STR
|
from .symbols import IDS as SYMBOLS_BY_STR
|
||||||
from .symbols import NAMES as SYMBOLS_BY_INT
|
from .symbols import NAMES as SYMBOLS_BY_INT
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
|
@ -16,6 +17,24 @@ from .errors import Errors
|
||||||
from . import util
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
|
def get_string_id(key):
    """Resolve a key to its string ID, respecting the reserved symbols.

    Keys that are not strings are returned unchanged (they are treated as
    already being IDs). String keys found in the symbols table map to their
    symbol ID, the empty string maps to 0, and any other string is hashed
    from its UTF-8 encoding.

    This helper favours convenience over speed, so avoid calling it in
    tight loops.
    """
    # Non-string keys are passed straight through.
    if not isinstance(key, basestring_):
        return key
    # Reserved symbols have fixed IDs that take priority over hashing.
    if key in SYMBOLS_BY_STR:
        return SYMBOLS_BY_STR[key]
    # An empty string that was not caught above maps to 0.
    if not key:
        return 0
    utf8_bytes = key.encode('utf8')
    return hash_utf8(utf8_bytes, len(utf8_bytes))
|
||||||
|
|
||||||
|
|
||||||
cpdef hash_t hash_string(unicode string) except 0:
    """Hash a unicode string via its UTF-8 encoded bytes.

    The string is encoded as UTF-8 and the resulting bytes, together with
    their length, are handed to hash_utf8. The signature is declared
    `except 0`, so 0 is the error sentinel for C-level callers.
    """
    utf8_bytes = string.encode('utf8')
    return hash_utf8(utf8_bytes, len(utf8_bytes))
|
||||||
|
|
Loading…
Reference in New Issue