Simplify lookup hashing

Just use `get_string_id` directly, which already does everything the local `ensure_hash` helper was supposed to do (hash string keys and pass any other key through unchanged).
This commit is contained in:
Ines Montani 2019-09-18 20:24:41 +02:00
parent dd1810f05a
commit f2c8b1e362
2 changed files with 11 additions and 17 deletions

View File

@ -7,16 +7,9 @@ from preshed.bloom import BloomFilter
from .errors import Errors from .errors import Errors
from .util import SimpleFrozenDict, ensure_path from .util import SimpleFrozenDict, ensure_path
from .compat import basestring_
from .strings import get_string_id from .strings import get_string_id
def ensure_hash(key):
if isinstance(key, basestring_):
return get_string_id(key)
return key
class Lookups(object): class Lookups(object):
"""Container for large lookup tables and dictionaries, e.g. lemmatization """Container for large lookup tables and dictionaries, e.g. lemmatization
data or tokenizer exception lists. Lookups are available via vocab.lookups, data or tokenizer exception lists. Lookups are available via vocab.lookups,
@ -202,7 +195,7 @@ class Table(OrderedDict):
key (unicode / int): The key to set. key (unicode / int): The key to set.
value: The value to set. value: The value to set.
""" """
key = ensure_hash(key) key = get_string_id(key)
OrderedDict.__setitem__(self, key, value) OrderedDict.__setitem__(self, key, value)
self.bloom.add(key) self.bloom.add(key)
@ -221,7 +214,7 @@ class Table(OrderedDict):
key (unicode / int): The key to get. key (unicode / int): The key to get.
RETURNS: The value. RETURNS: The value.
""" """
key = ensure_hash(key) key = get_string_id(key)
return OrderedDict.__getitem__(self, key) return OrderedDict.__getitem__(self, key)
def get(self, key, default=None): def get(self, key, default=None):
@ -231,7 +224,7 @@ class Table(OrderedDict):
default: The default value to return. default: The default value to return.
RETURNS: The value. RETURNS: The value.
""" """
key = ensure_hash(key) key = get_string_id(key)
return OrderedDict.get(self, key, default) return OrderedDict.get(self, key, default)
def __contains__(self, key): def __contains__(self, key):
@ -240,7 +233,7 @@ class Table(OrderedDict):
key (unicode / int): The key to check. key (unicode / int): The key to check.
RETURNS (bool): Whether the key is in the table. RETURNS (bool): Whether the key is in the table.
""" """
key = ensure_hash(key) key = get_string_id(key)
# This can give a false positive, so we need to check it after # This can give a false positive, so we need to check it after
if key not in self.bloom: if key not in self.bloom:
return False return False

View File

@ -2,7 +2,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
from spacy.lookups import Lookups, Table, ensure_hash from spacy.lookups import Lookups, Table
from spacy.strings import get_string_id
from spacy.vocab import Vocab from spacy.vocab import Vocab
from ..util import make_tempdir from ..util import make_tempdir
@ -45,17 +46,17 @@ def test_table_api():
table = Table(name="table", data=data) table = Table(name="table", data=data)
assert len(table) == len(data) assert len(table) == len(data)
assert "foo" in table assert "foo" in table
assert ensure_hash("foo") in table assert get_string_id("foo") in table
assert table["foo"] == "bar" assert table["foo"] == "bar"
assert table[ensure_hash("foo")] == "bar" assert table[get_string_id("foo")] == "bar"
assert table.get("foo") == "bar" assert table.get("foo") == "bar"
assert table.get("abc") is None assert table.get("abc") is None
table["abc"] = 123 table["abc"] = 123
assert table["abc"] == 123 assert table["abc"] == 123
assert table[ensure_hash("abc")] == 123 assert table[get_string_id("abc")] == 123
table.set("def", 456) table.set("def", 456)
assert table["def"] == 456 assert table["def"] == 456
assert table[ensure_hash("def")] == 456 assert table[get_string_id("def")] == 456
def test_table_api_to_from_bytes(): def test_table_api_to_from_bytes():
@ -66,7 +67,7 @@ def test_table_api_to_from_bytes():
assert new_table.name == "table" assert new_table.name == "table"
assert len(new_table) == 3 assert len(new_table) == 3
assert new_table["foo"] == "bar" assert new_table["foo"] == "bar"
assert new_table[ensure_hash("foo")] == "bar" assert new_table[get_string_id("foo")] == "bar"
new_table2 = Table(data={"def": 456}) new_table2 = Table(data={"def": 456})
new_table2.from_bytes(table_bytes) new_table2.from_bytes(table_bytes)
assert len(new_table2) == 3 assert len(new_table2) == 3