From 16fd8dce1d208c7a42cc44cb08ca5cfd87db73d5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 10 Dec 2018 16:09:26 +0100
Subject: [PATCH] Add get_string_id helper to spacy.strings

---
Review notes (free-form text between the "---" line and the diff; it is not
part of the commit message or of the patch payload):

 * First hunk: imports basestring_ from .compat; the new helper uses it for
   its isinstance() check.
 * Second hunk: adds a module-level get_string_id(key) directly above the
   existing hash_string(). Branches, in order:
     1. key is not a basestring_ instance -> key is returned unchanged
        (per the docstring, it is treated as already being an ID);
     2. key is in SYMBOLS_BY_STR          -> SYMBOLS_BY_STR[key] is returned;
     3. key is the empty string           -> 0 is returned;
     4. any other string                  -> key is encoded as UTF-8 and the
        bytes and their length are passed to hash_utf8().
 * Branch 4 repeats the two-line body of hash_string(), visible as trailing
   context in the second hunk.
 * hash_utf8 is not defined in the lines shown here; presumably it is
   declared elsewhere in strings.pyx -- confirm before applying this patch
   to a different base revision.

 spacy/strings.pyx | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 2c8d5fcb4..26407ec59 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -9,6 +9,7 @@ from libc.stdint cimport uint32_t
 from murmurhash.mrmr cimport hash64, hash32
 import srsly
 
+from .compat import basestring_
 from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 from .typedefs cimport hash_t
@@ -16,6 +17,24 @@ from .errors import Errors
 from . import util
 
 
+def get_string_id(key):
+    """Get a string ID, handling the reserved symbols correctly. If the key is
+    already an ID, return it.
+
+    This function optimises for convenience over performance, so shouldn't be
+    used in tight loops.
+    """
+    if not isinstance(key, basestring_):
+        return key
+    elif key in SYMBOLS_BY_STR:
+        return SYMBOLS_BY_STR[key]
+    elif not key:
+        return 0
+    else:
+        chars = key.encode('utf8')
+        return hash_utf8(chars, len(chars))
+
+
 cpdef hash_t hash_string(unicode string) except 0:
     chars = string.encode('utf8')
     return hash_utf8(chars, len(chars))