From f2c8b1e3629ba23f1db62352868cb95374d42cdb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 18 Sep 2019 20:24:41 +0200 Subject: [PATCH] Simplify lookup hashing Just use get_string_id, which already does everything ensure_hash was supposed to do --- spacy/lookups.py | 15 ++++----------- spacy/tests/vocab_vectors/test_lookups.py | 13 +++++++------ 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/spacy/lookups.py b/spacy/lookups.py index c1f351fe5..05a60f289 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -7,16 +7,9 @@ from preshed.bloom import BloomFilter from .errors import Errors from .util import SimpleFrozenDict, ensure_path -from .compat import basestring_ from .strings import get_string_id -def ensure_hash(key): - if isinstance(key, basestring_): - return get_string_id(key) - return key - - class Lookups(object): """Container for large lookup tables and dictionaries, e.g. lemmatization data or tokenizer exception lists. Lookups are available via vocab.lookups, @@ -202,7 +195,7 @@ class Table(OrderedDict): key (unicode / int): The key to set. value: The value to set. """ - key = ensure_hash(key) + key = get_string_id(key) OrderedDict.__setitem__(self, key, value) self.bloom.add(key) @@ -221,7 +214,7 @@ class Table(OrderedDict): key (unicode / int): The key to get. RETURNS: The value. """ - key = ensure_hash(key) + key = get_string_id(key) return OrderedDict.__getitem__(self, key) def get(self, key, default=None): @@ -231,7 +224,7 @@ class Table(OrderedDict): default: The default value to return. RETURNS: The value. """ - key = ensure_hash(key) + key = get_string_id(key) return OrderedDict.get(self, key, default) def __contains__(self, key): @@ -240,7 +233,7 @@ class Table(OrderedDict): key (unicode / int): The key to check. RETURNS (bool): Whether the key is in the table. """ - key = ensure_hash(key) + key = get_string_id(key) # This can give a false positive, so we need to check it after if key not in self.bloom: return False diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index 02f25532a..f78dd33c4 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -2,7 +2,8 @@ from __future__ import unicode_literals import pytest -from spacy.lookups import Lookups, Table, ensure_hash +from spacy.lookups import Lookups, Table +from spacy.strings import get_string_id from spacy.vocab import Vocab from ..util import make_tempdir @@ -45,17 +46,17 @@ def test_table_api(): table = Table(name="table", data=data) assert len(table) == len(data) assert "foo" in table - assert ensure_hash("foo") in table + assert get_string_id("foo") in table assert table["foo"] == "bar" - assert table[ensure_hash("foo")] == "bar" + assert table[get_string_id("foo")] == "bar" assert table.get("foo") == "bar" assert table.get("abc") is None table["abc"] = 123 assert table["abc"] == 123 - assert table[ensure_hash("abc")] == 123 + assert table[get_string_id("abc")] == 123 table.set("def", 456) assert table["def"] == 456 - assert table[ensure_hash("def")] == 456 + assert table[get_string_id("def")] == 456 def test_table_api_to_from_bytes(): @@ -66,7 +67,7 @@ def test_table_api_to_from_bytes(): assert new_table.name == "table" assert len(new_table) == 3 assert new_table["foo"] == "bar" - assert new_table[ensure_hash("foo")] == "bar" + assert new_table[get_string_id("foo")] == "bar" new_table2 = Table(data={"def": 456}) new_table2.from_bytes(table_bytes) assert len(new_table2) == 3