2020-07-22 11:42:59 +00:00
|
|
|
from typing import Dict, Any, List, Union, Optional
|
|
|
|
from pathlib import Path
|
2019-09-09 17:17:55 +00:00
|
|
|
import srsly
|
2019-09-15 20:08:13 +00:00
|
|
|
from preshed.bloom import BloomFilter
|
2019-12-22 00:53:56 +00:00
|
|
|
from collections import OrderedDict
|
2019-09-09 17:17:55 +00:00
|
|
|
|
|
|
|
from .errors import Errors
|
2020-07-22 13:59:37 +00:00
|
|
|
from .util import SimpleFrozenDict, ensure_path, registry, load_language_data
|
2019-09-15 20:08:13 +00:00
|
|
|
from .strings import get_string_id
|
2019-08-22 12:21:32 +00:00
|
|
|
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should probably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
|
2019-10-01 19:36:04 +00:00
|
|
|
# Unique sentinel used as the default for Lookups.get_table's "default"
# argument, so that "no default supplied" can be told apart from an explicit
# default value such as None.
UNSET = object()
|
|
|
|
|
|
|
|
|
2020-07-22 11:42:59 +00:00
|
|
|
@registry.language_data("spacy-lookups-data")
def get_lookups(lang: str, tables: List[str]) -> Optional[Dict[str, Any]]:
    """Load the data from the spacy-lookups-data package for a given language,
    if available. Returns an empty dict if the language has no entry in the
    lookups registry (e.g. because the package is not installed).

    lang (str): The language code (corresponds to entry point exposed by
        the spacy-lookups-data package).
    tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"]
    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
    RAISES (ValueError): If a requested table is not listed for the language.
    """
    # TODO: import spacy_lookups_data instead of going via entry points here?
    if lang not in registry.lookups:
        return {}
    data = registry.lookups.get(lang)
    result = {}
    for table in tables:
        if table not in data:
            # Name the offending table and language so the failure can be
            # diagnosed without stepping through the loop.
            raise ValueError(
                "Unknown lookups table '{}' for language '{}'".format(table, lang)
            )
        result[table] = load_language_data(data[table])
    return result
|
2020-07-22 11:42:59 +00:00
|
|
|
|
|
|
|
|
2020-07-12 12:03:23 +00:00
|
|
|
class Lookups:
    """Container for large lookup tables and dictionaries, e.g. lemmatization
    data or tokenizer exception lists. Lookups are available via vocab.lookups,
    so they can be accessed before the pipeline components are applied (e.g.
    in the tokenizer and lemmatizer), as well as within the pipeline components
    via doc.vocab.lookups.
    """

    def __init__(self) -> None:
        """Initialize the Lookups object.

        RETURNS (Lookups): The newly created object.

        DOCS: https://spacy.io/api/lookups#init
        """
        # Maps table name -> Table instance.
        self._tables = {}

    def __contains__(self, name: str) -> bool:
        """Check if the lookups contain a table of a given name. Delegates to
        Lookups.has_table.

        name (str): Name of the table.
        RETURNS (bool): Whether a table of that name is in the lookups.
        """
        return self.has_table(name)

    def __len__(self) -> int:
        """RETURNS (int): The number of tables in the lookups."""
        return len(self._tables)

    @property
    def tables(self) -> List[str]:
        """RETURNS (List[str]): Names of all tables in the lookups."""
        return list(self._tables.keys())

    def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
        """Add a new table to the lookups. Raises an error if the table exists.

        name (str): Unique name of table.
        data (dict): Optional data to add to the table.
        RETURNS (Table): The newly added table.
        RAISES (ValueError): If a table of that name already exists.

        DOCS: https://spacy.io/api/lookups#add_table
        """
        # Check the underlying dict directly rather than building a list of
        # names via the `tables` property for every membership test.
        if name in self._tables:
            raise ValueError(Errors.E158.format(name=name))
        table = Table(name=name, data=data)
        self._tables[name] = table
        return table

    def get_table(self, name: str, default: Any = UNSET) -> "Table":
        """Get a table. Raises an error if the table doesn't exist and no
        default value is provided.

        name (str): Name of the table.
        default (Any): Optional default value to return if table doesn't exist.
        RETURNS (Table): The table, or the default if the table doesn't exist.
        RAISES (KeyError): If the table doesn't exist and no default is given.

        DOCS: https://spacy.io/api/lookups#get_table
        """
        if name not in self._tables:
            # Identity check: the sentinel must never be matched through a
            # caller-supplied object's __eq__ (which may return a non-bool or
            # raise, e.g. for array-like defaults).
            if default is UNSET:
                raise KeyError(Errors.E159.format(name=name, tables=self.tables))
            return default
        return self._tables[name]

    def remove_table(self, name: str) -> "Table":
        """Remove a table. Raises an error if the table doesn't exist.

        name (str): Name of the table to remove.
        RETURNS (Table): The removed table.
        RAISES (KeyError): If the table doesn't exist.

        DOCS: https://spacy.io/api/lookups#remove_table
        """
        if name not in self._tables:
            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
        return self._tables.pop(name)

    def has_table(self, name: str) -> bool:
        """Check if the lookups contain a table of a given name.

        name (str): Name of the table.
        RETURNS (bool): Whether a table of that name exists.

        DOCS: https://spacy.io/api/lookups#has_table
        """
        return name in self._tables

    def to_bytes(self, **kwargs) -> bytes:
        """Serialize the lookups to a bytestring.

        RETURNS (bytes): The serialized Lookups.

        DOCS: https://spacy.io/api/lookups#to_bytes
        """
        return srsly.msgpack_dumps(self._tables)

    def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
        """Load the lookups from a bytestring. Any tables currently held are
        discarded and replaced by the loaded ones.

        bytes_data (bytes): The data to load.
        RETURNS (Lookups): The loaded Lookups.

        DOCS: https://spacy.io/api/lookups#from_bytes
        """
        self._tables = {}
        for key, value in srsly.msgpack_loads(bytes_data).items():
            # Re-wrap each deserialized mapping in a Table.
            self._tables[key] = Table(key, value)
        return self

    def to_disk(
        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
    ) -> None:
        """Save the lookups to a directory as lookups.bin. Expects a path to a
        directory, which will be created if it doesn't exist. Nothing is
        written if the lookups contain no tables.

        path (str / Path): The directory path.
        filename (str): Name of the file to write inside the directory.

        DOCS: https://spacy.io/api/lookups#to_disk
        """
        if self._tables:
            path = ensure_path(path)
            if not path.exists():
                path.mkdir()
            filepath = path / filename
            with filepath.open("wb") as file_:
                file_.write(self.to_bytes())

    def from_disk(
        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
    ) -> "Lookups":
        """Load lookups from a directory containing a lookups.bin. Will skip
        loading if the file doesn't exist.

        path (str / Path): The directory path.
        filename (str): Name of the file to read inside the directory.
        RETURNS (Lookups): The loaded lookups.

        DOCS: https://spacy.io/api/lookups#from_disk
        """
        path = ensure_path(path)
        filepath = path / filename
        if filepath.exists():
            with filepath.open("rb") as file_:
                data = file_.read()
            return self.from_bytes(data)
        return self
|
|
|
|
|
|
|
|
|
|
|
|
class Table(OrderedDict):
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should probably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
"""A table in the lookups. Subclass of builtin dict that implements a
|
2019-09-15 20:08:13 +00:00
|
|
|
slightly more consistent and unified API.
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should probably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
|
|
|
|
Includes a Bloom filter to speed up missed lookups.
|
2019-09-09 17:17:55 +00:00
|
|
|
"""
|
2019-09-11 12:00:36 +00:00
|
|
|
|
2019-09-09 17:17:55 +00:00
|
|
|
@classmethod
|
2020-07-22 11:42:59 +00:00
|
|
|
def from_dict(cls, data: dict, name: Optional[str] = None) -> "Table":
|
2019-09-12 12:00:14 +00:00
|
|
|
"""Initialize a new table from a dict.
|
|
|
|
|
|
|
|
data (dict): The dictionary.
|
2020-05-24 15:20:58 +00:00
|
|
|
name (str): Optional table name for reference.
|
2019-09-12 12:00:14 +00:00
|
|
|
RETURNS (Table): The newly created object.
|
2019-09-18 17:57:21 +00:00
|
|
|
|
|
|
|
DOCS: https://spacy.io/api/lookups#table.from_dict
|
2019-09-12 12:00:14 +00:00
|
|
|
"""
|
2019-09-09 17:17:55 +00:00
|
|
|
self = cls(name=name)
|
|
|
|
self.update(data)
|
|
|
|
return self
|
2019-08-22 12:21:32 +00:00
|
|
|
|
2020-07-22 11:42:59 +00:00
|
|
|
def __init__(self, name: Optional[str] = None, data: Optional[dict] = None) -> None:
|
2019-09-09 17:17:55 +00:00
|
|
|
"""Initialize a new table.
|
|
|
|
|
2020-05-24 15:20:58 +00:00
|
|
|
name (str): Optional table name for reference.
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should probably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
data (dict): Initial data, used to hint Bloom Filter.
|
2019-09-09 17:17:55 +00:00
|
|
|
RETURNS (Table): The newly created object.
|
2019-09-18 17:57:21 +00:00
|
|
|
|
|
|
|
DOCS: https://spacy.io/api/lookups#table.init
|
2019-09-09 17:17:55 +00:00
|
|
|
"""
|
|
|
|
OrderedDict.__init__(self)
|
2019-08-22 12:21:32 +00:00
|
|
|
self.name = name
|
2019-09-15 20:08:13 +00:00
|
|
|
# Assume a default size of 1M items
|
|
|
|
self.default_size = 1e6
|
2020-06-26 12:09:10 +00:00
|
|
|
size = max(len(data), 1) if data is not None else self.default_size
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should probably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
self.bloom = BloomFilter.from_error_rate(size)
|
|
|
|
if data:
|
|
|
|
self.update(data)
|
|
|
|
|
2020-07-22 11:42:59 +00:00
|
|
|
def __setitem__(self, key: Union[str, int], value: Any) -> None:
|
2019-09-15 20:08:13 +00:00
|
|
|
"""Set new key/value pair. String keys will be hashed.
|
|
|
|
|
2020-05-24 15:20:58 +00:00
|
|
|
key (str / int): The key to set.
|
2019-09-15 20:08:13 +00:00
|
|
|
value: The value to set.
|
|
|
|
"""
|
2019-09-18 18:24:41 +00:00
|
|
|
key = get_string_id(key)
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should probably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
OrderedDict.__setitem__(self, key, value)
|
|
|
|
self.bloom.add(key)
|
|
|
|
|
2020-07-22 11:42:59 +00:00
|
|
|
def set(self, key: Union[str, int], value: Any) -> None:
    """Store a value under the given key. String keys will be hashed.

    Method-call spelling of the item assignment table[key] = value.

    key (str / int): The key to set.
    value: The value to set.
    """
    self.__setitem__(key, value)
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should probably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
|
2020-07-22 11:42:59 +00:00
|
|
|
def __getitem__(self, key: Union[str, int]) -> Any:
    """Look up the value stored under a key. String keys will be hashed.

    key (str / int): The key to get.
    RETURNS: The value.
    """
    # Normalize the key with get_string_id before touching the underlying dict.
    return OrderedDict.__getitem__(self, get_string_id(key))
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should probably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
|
2020-07-22 11:42:59 +00:00
|
|
|
def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
    """Look up the value stored under a key, falling back to a default.
    String keys will be hashed.

    key (str / int): The key to get.
    default: The default value to return.
    RETURNS: The value.
    """
    # Normalize the key with get_string_id before touching the underlying dict.
    return OrderedDict.get(self, get_string_id(key), default)
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should probably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
|
2020-07-22 11:42:59 +00:00
|
|
|
def __contains__(self, key: Union[str, int]) -> bool:
    """Report whether a key is present in the table. String keys will be
    hashed.

    key (str / int): The key to check.
    RETURNS (bool): Whether the key is in the table.
    """
    hashed_key = get_string_id(key)
    # The bloom filter can give a false positive, so a hit is confirmed
    # against the underlying dict; a miss is reported as absent right away.
    return hashed_key in self.bloom and OrderedDict.__contains__(self, hashed_key)
|
|
|
|
|
2020-07-22 11:42:59 +00:00
|
|
|
def to_bytes(self) -> bytes:
    """Pack the table's name, its key/value entries and its bloom filter
    into a msgpack bytestring.

    RETURNS (bytes): The serialized table.

    DOCS: https://spacy.io/api/lookups#table.to_bytes
    """
    return srsly.msgpack_dumps(
        {
            "name": self.name,
            # Plain dict copy of the entries for serialization.
            "dict": dict(self.items()),
            "bloom": self.bloom.to_bytes(),
        }
    )
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should probably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
|
2020-07-22 11:42:59 +00:00
|
|
|
def from_bytes(self, bytes_data: bytes) -> "Table":
    """Restore this table in place from a msgpack-encoded bytestring.

    bytes_data (bytes): The serialized table data to load.
    RETURNS (Table): This table, holding the loaded contents.

    DOCS: https://spacy.io/api/lookups#table.from_bytes
    """
    payload = srsly.msgpack_loads(bytes_data)
    # A missing "dict" entry is treated as an empty table.
    entries = payload.get("dict", {})
    self.name = payload["name"]
    # Rebuild the bloom filter from its own serialized form before the
    # entries are re-inserted below.
    self.bloom = BloomFilter().from_bytes(payload["bloom"])
    # Drop whatever the table currently holds, then load the new entries.
    self.clear()
    self.update(entries)
    return self
|