spaCy/spacy/tests/vocab_vectors/test_lookups.py

149 lines
4.8 KiB
Python
Raw Normal View History

# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lookups import Lookups, Table
from spacy.strings import get_string_id
from spacy.vocab import Vocab
from ..util import make_tempdir
def test_lookups_api():
    """Exercise the Lookups container: add, query, fetch and remove a table."""
    name = "test"
    entries = {"foo": "bar", "hello": "world"}
    lookups = Lookups()
    lookups.add_table(name, entries)
    # The new table is reported via len(), membership and has_table().
    assert len(lookups) == 1
    assert name in lookups
    assert lookups.has_table(name)
    fetched = lookups.get_table(name)
    assert fetched.name == name
    assert len(fetched) == 2
    assert fetched["hello"] == "world"
    # An entry written through the fetched table must show up when the
    # table is fetched again from the container.
    fetched["a"] = "b"
    assert fetched["a"] == "b"
    refetched = lookups.get_table(name)
    assert len(refetched) == 3
    # Unknown names raise KeyError; re-adding an existing name raises
    # ValueError.
    with pytest.raises(KeyError):
        lookups.get_table("xyz")
    with pytest.raises(ValueError):
        lookups.add_table(name)
    # Removing hands back the table and leaves the container empty.
    removed = lookups.remove_table(name)
    assert removed.name == name
    assert len(lookups) == 0
    assert name not in lookups
    with pytest.raises(KeyError):
        lookups.get_table(name)
def test_table_api():
    """Check Table construction, membership and access by string or string ID."""
    empty = Table(name="table")
    assert empty.name == "table"
    assert len(empty) == 0
    assert "abc" not in empty

    data = {"foo": "bar", "hello": "world"}
    table = Table(name="table", data=data)
    assert len(table) == len(data)
    # Keys are reachable both by the string and by its hashed ID.
    for key in ("foo", get_string_id("foo")):
        assert key in table
    for key in ("foo", get_string_id("foo")):
        assert table[key] == "bar"
    assert table.get("foo") == "bar"
    assert table.get("abc") is None

    # Item assignment and set() behave the same for both key forms.
    table["abc"] = 123
    for key in ("abc", get_string_id("abc")):
        assert table[key] == 123
    table.set("def", 456)
    for key in ("def", get_string_id("def")):
        assert table[key] == 456
def test_table_api_to_from_bytes():
    """Round-trip a Table through to_bytes() / from_bytes()."""
    source = Table(
        name="table", data={"foo": "bar", "hello": "world", "abc": 123}
    )
    payload = source.to_bytes()

    restored = Table().from_bytes(payload)
    assert restored.name == "table"
    assert len(restored) == 3
    assert restored["foo"] == "bar"
    assert restored[get_string_id("foo")] == "bar"

    # Loading into a table that already holds data must drop the old
    # entries rather than merge with them.
    prefilled = Table(data={"def": 456})
    prefilled.from_bytes(payload)
    assert len(prefilled) == 3
    assert "def" not in prefilled
2019-09-15 16:04:44 +00:00
@pytest.mark.skip(reason="This fails on Python 3.5")
def test_lookups_to_from_bytes():
    """Round-trip a two-table Lookups object through to_bytes() / from_bytes()."""
    original = Lookups()
    original.add_table("table1", {"foo": "bar", "hello": "world"})
    original.add_table("table2", {"a": 1, "b": 2, "c": 3})
    serialized = original.to_bytes()

    restored = Lookups()
    restored.from_bytes(serialized)
    assert len(restored) == 2
    for expected_name in ("table1", "table2"):
        assert expected_name in restored

    first = restored.get_table("table1")
    assert len(first) == 2
    assert first["foo"] == "bar"
    second = restored.get_table("table2")
    assert len(second) == 3
    assert second["b"] == 2
    # Re-serializing the restored object must give byte-identical output.
    assert restored.to_bytes() == serialized
2019-09-11 09:38:22 +00:00
@pytest.mark.skip(reason="This fails on Python 3.5")
def test_lookups_to_from_disk():
    """Round-trip a two-table Lookups object through a temporary directory."""
    original = Lookups()
    original.add_table("table1", {"foo": "bar", "hello": "world"})
    original.add_table("table2", {"a": 1, "b": 2, "c": 3})

    restored = Lookups()
    with make_tempdir() as tmpdir:
        original.to_disk(tmpdir)
        restored.from_disk(tmpdir)

    assert len(restored) == 2
    for expected_name in ("table1", "table2"):
        assert expected_name in restored

    first = restored.get_table("table1")
    assert len(first) == 2
    assert first["foo"] == "bar"
    second = restored.get_table("table2")
    assert len(second) == 3
    assert second["b"] == 2
Bloom-filter backed Lookup Tables (#4268) * Improve load_language_data helper * WIP: Add Lookups implementation * Start moving lemma data over to JSON * WIP: move data over for more languages * Convert more languages * Fix lemmatizer fixtures in tests * Finish conversion * Auto-format JSON files * Fix test for now * Make sure tables are stored on instance * Update docstrings * Update docstrings and errors * Update test * Add Lookups.__len__ * Add serialization methods * Add Lookups.remove_table * Use msgpack for serialization to disk * Fix file exists check * Try using OrderedDict for everything * Update .flake8 [ci skip] * Try fixing serialization * Update test_lookups.py * Update test_serialize_vocab_strings.py * Lookups / Tables now work This implements the stubs in the Lookups/Table classes. Currently this is in Cython but with no type declarations, so that could be improved. * Add lookups to setup.py * Actually add lookups pyx The previous commit added the old py file... * Lookups work-in-progress * Move from pyx back to py * Add string based lookups, fix serialization * Update tests, language/lemmatizer to work with string lookups There are some outstanding issues here: - a pickling-related test fails due to the bloom filter - some custom lemmatizers (fr/nl at least) have issues More generally, there's a question of how to deal with the case where you have a string but want to use the lookup table. Currently the table allows access by string or id, but that's getting pretty awkward. * Change lemmatizer lookup method to pass (orth, string) * Fix token lookup * Fix French lookup * Fix lt lemmatizer test * Fix Dutch lemmatizer * Fix lemmatizer lookup test This was using a normal dict instead of a Table, so checks for the string instead of an integer key failed. * Make uk/nl/ru lemmatizer lookup methods consistent The mentioned tokenizers all have their own implementation of the `lookup` method, which accesses a `Lookups` table. 
The way that was called in `token.pyx` was changed so this should be updated to have the same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id, string)). Prior to this change tests weren't failing, but there would probably be issues with normal use of a model. More tests should probably be added. Additionally, the language-specific `lookup` implementations seem like they might not be needed, since they handle things like lower-casing that aren't actually language specific. * Make recently added Greek method compatible * Remove redundant class/method Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
@pytest.mark.skip(reason="This fails on Python 3.5")
def test_lookups_to_from_bytes_via_vocab():
    """Check that a Vocab's lookups survive a to_bytes() / from_bytes() round trip."""
    table_name = "test"
    vocab = Vocab()
    vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
    assert table_name in vocab.lookups
    vocab_bytes = vocab.to_bytes()
    new_vocab = Vocab()
    new_vocab.from_bytes(vocab_bytes)
    # Compare against the source vocab's table count rather than a fixed
    # number, so tables the Vocab holds besides ours don't matter.
    assert len(new_vocab.lookups) == len(vocab.lookups)
    assert table_name in new_vocab.lookups
    table = new_vocab.lookups.get_table(table_name)
    assert len(table) == 2
    assert table["hello"] == "world"
    # Re-serializing the restored vocab must give byte-identical output.
    assert new_vocab.to_bytes() == vocab_bytes
2019-09-14 10:58:06 +00:00
@pytest.mark.skip(reason="This fails on Python 3.5")
def test_lookups_to_from_disk_via_vocab():
    """Check that a Vocab's lookups survive a to_disk() / from_disk() round trip."""
    table_name = "test"
    vocab = Vocab()
    vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
    assert table_name in vocab.lookups
    with make_tempdir() as tmpdir:
        vocab.to_disk(tmpdir)
        new_vocab = Vocab()
        new_vocab.from_disk(tmpdir)
    # Compare against the source vocab's table count rather than a fixed
    # number, so tables the Vocab holds besides ours don't matter.
    assert len(new_vocab.lookups) == len(vocab.lookups)
    assert table_name in new_vocab.lookups
    table = new_vocab.lookups.get_table(table_name)
    assert len(table) == 2
    assert table["hello"] == "world"