2019-08-22 12:21:32 +00:00
|
|
|
# coding: utf-8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
from spacy.lookups import Lookups
|
2019-09-09 17:17:55 +00:00
|
|
|
from spacy.vocab import Vocab
|
|
|
|
|
|
|
|
from ..util import make_tempdir
|
2019-08-22 12:21:32 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_lookups_api():
|
|
|
|
table_name = "test"
|
|
|
|
data = {"foo": "bar", "hello": "world"}
|
|
|
|
lookups = Lookups()
|
|
|
|
lookups.add_table(table_name, data)
|
2019-09-09 17:17:55 +00:00
|
|
|
assert len(lookups) == 1
|
2019-08-22 12:21:32 +00:00
|
|
|
assert table_name in lookups
|
|
|
|
assert lookups.has_table(table_name)
|
|
|
|
table = lookups.get_table(table_name)
|
|
|
|
assert table.name == table_name
|
|
|
|
assert len(table) == 2
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specificially (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should proably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
assert table.get_string("hello") == "world"
|
|
|
|
table.set_string("a", "b")
|
|
|
|
assert table.get_string("a") == "b"
|
2019-08-22 12:21:32 +00:00
|
|
|
table = lookups.get_table(table_name)
|
|
|
|
assert len(table) == 3
|
|
|
|
with pytest.raises(KeyError):
|
|
|
|
lookups.get_table("xyz")
|
2019-09-09 17:17:55 +00:00
|
|
|
with pytest.raises(ValueError):
|
|
|
|
lookups.add_table(table_name)
|
|
|
|
table = lookups.remove_table(table_name)
|
|
|
|
assert table.name == table_name
|
|
|
|
assert len(lookups) == 0
|
|
|
|
assert table_name not in lookups
|
|
|
|
with pytest.raises(KeyError):
|
|
|
|
lookups.get_table(table_name)
|
|
|
|
|
|
|
|
|
2019-09-15 15:00:17 +00:00
|
|
|
@pytest.mark.xfail(reason="This fails on Python 3.5")
|
2019-09-09 17:17:55 +00:00
|
|
|
def test_lookups_to_from_bytes():
|
|
|
|
lookups = Lookups()
|
|
|
|
lookups.add_table("table1", {"foo": "bar", "hello": "world"})
|
|
|
|
lookups.add_table("table2", {"a": 1, "b": 2, "c": 3})
|
|
|
|
lookups_bytes = lookups.to_bytes()
|
|
|
|
new_lookups = Lookups()
|
|
|
|
new_lookups.from_bytes(lookups_bytes)
|
|
|
|
assert len(new_lookups) == 2
|
|
|
|
assert "table1" in new_lookups
|
|
|
|
assert "table2" in new_lookups
|
|
|
|
table1 = new_lookups.get_table("table1")
|
|
|
|
assert len(table1) == 2
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specificially (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should proably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
assert table1.get_string("foo") == "bar"
|
2019-09-09 17:17:55 +00:00
|
|
|
table2 = new_lookups.get_table("table2")
|
|
|
|
assert len(table2) == 3
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specificially (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should proably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
assert table2.get_string("b") == 2
|
2019-09-09 17:17:55 +00:00
|
|
|
assert new_lookups.to_bytes() == lookups_bytes
|
|
|
|
|
2019-09-11 09:38:22 +00:00
|
|
|
|
2019-09-15 15:00:17 +00:00
|
|
|
@pytest.mark.skip(reason="This fails on Python 3.5")
|
2019-09-09 17:17:55 +00:00
|
|
|
def test_lookups_to_from_disk():
|
|
|
|
lookups = Lookups()
|
|
|
|
lookups.add_table("table1", {"foo": "bar", "hello": "world"})
|
|
|
|
lookups.add_table("table2", {"a": 1, "b": 2, "c": 3})
|
|
|
|
with make_tempdir() as tmpdir:
|
|
|
|
lookups.to_disk(tmpdir)
|
|
|
|
new_lookups = Lookups()
|
|
|
|
new_lookups.from_disk(tmpdir)
|
|
|
|
assert len(new_lookups) == 2
|
|
|
|
assert "table1" in new_lookups
|
|
|
|
assert "table2" in new_lookups
|
|
|
|
table1 = new_lookups.get_table("table1")
|
|
|
|
assert len(table1) == 2
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specificially (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should proably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
assert table1.get_string("foo") == "bar"
|
2019-09-09 17:17:55 +00:00
|
|
|
table2 = new_lookups.get_table("table2")
|
|
|
|
assert len(table2) == 3
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specificially (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should proably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
assert table2.get_string("b") == 2
|
|
|
|
|
2019-09-09 17:17:55 +00:00
|
|
|
|
2019-09-15 15:00:17 +00:00
|
|
|
@pytest.mark.skip(reason="This fails on Python 3.5")
|
2019-09-09 17:17:55 +00:00
|
|
|
def test_lookups_to_from_bytes_via_vocab():
|
|
|
|
table_name = "test"
|
|
|
|
vocab = Vocab()
|
|
|
|
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
|
|
|
|
assert len(vocab.lookups) == 1
|
|
|
|
assert table_name in vocab.lookups
|
|
|
|
vocab_bytes = vocab.to_bytes()
|
|
|
|
new_vocab = Vocab()
|
|
|
|
new_vocab.from_bytes(vocab_bytes)
|
|
|
|
assert len(new_vocab.lookups) == 1
|
|
|
|
assert table_name in new_vocab.lookups
|
|
|
|
table = new_vocab.lookups.get_table(table_name)
|
|
|
|
assert len(table) == 2
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specificially (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should proably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
assert table.get_string("hello") == "world"
|
2019-09-09 17:17:55 +00:00
|
|
|
assert new_vocab.to_bytes() == vocab_bytes
|
|
|
|
|
2019-09-14 10:58:06 +00:00
|
|
|
|
2019-09-15 15:00:17 +00:00
|
|
|
@pytest.mark.skip(reason="This fails on Python 3.5")
|
2019-09-09 17:17:55 +00:00
|
|
|
def test_lookups_to_from_disk_via_vocab():
|
|
|
|
table_name = "test"
|
|
|
|
vocab = Vocab()
|
|
|
|
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
|
|
|
|
assert len(vocab.lookups) == 1
|
|
|
|
assert table_name in vocab.lookups
|
|
|
|
with make_tempdir() as tmpdir:
|
|
|
|
vocab.to_disk(tmpdir)
|
|
|
|
new_vocab = Vocab()
|
|
|
|
new_vocab.from_disk(tmpdir)
|
|
|
|
assert len(new_vocab.lookups) == 1
|
|
|
|
assert table_name in new_vocab.lookups
|
|
|
|
table = new_vocab.lookups.get_table(table_name)
|
|
|
|
assert len(table) == 2
|
Bloom-filter backed Lookup Tables (#4268)
* Improve load_language_data helper
* WIP: Add Lookups implementation
* Start moving lemma data over to JSON
* WIP: move data over for more languages
* Convert more languages
* Fix lemmatizer fixtures in tests
* Finish conversion
* Auto-format JSON files
* Fix test for now
* Make sure tables are stored on instance
* Update docstrings
* Update docstrings and errors
* Update test
* Add Lookups.__len__
* Add serialization methods
* Add Lookups.remove_table
* Use msgpack for serialization to disk
* Fix file exists check
* Try using OrderedDict for everything
* Update .flake8 [ci skip]
* Try fixing serialization
* Update test_lookups.py
* Update test_serialize_vocab_strings.py
* Lookups / Tables now work
This implements the stubs in the Lookups/Table classes. Currently this
is in Cython but with no type declarations, so that could be improved.
* Add lookups to setup.py
* Actually add lookups pyx
The previous commit added the old py file...
* Lookups work-in-progress
* Move from pyx back to py
* Add string based lookups, fix serialization
* Update tests, language/lemmatizer to work with string lookups
There are some outstanding issues here:
- a pickling-related test fails due to the bloom filter
- some custom lemmatizers (fr/nl at least) have issues
More generally, there's a question of how to deal with the case where
you have a string but want to use the lookup table. Currently the table
allows access by string or id, but that's getting pretty awkward.
* Change lemmatizer lookup method to pass (orth, string)
* Fix token lookup
* Fix French lookup
* Fix lt lemmatizer test
* Fix Dutch lemmatizer
* Fix lemmatizer lookup test
This was using a normal dict instead of a Table, so checks for the
string instead of an integer key failed.
* Make uk/nl/ru lemmatizer lookup methods consistent
The mentioned tokenizers all have their own implementation of the
`lookup` method, which accesses a `Lookups` table. The way that was
called in `token.pyx` was changed so this should be updated to have the
same arguments as `lookup` in `lemmatizer.py` (specificially (orth/id,
string)).
Prior to this change tests weren't failing, but there would probably be
issues with normal use of a model. More tests should proably be added.
Additionally, the language-specific `lookup` implementations seem like
they might not be needed, since they handle things like lower-casing
that aren't actually language specific.
* Make recently added Greek method compatible
* Remove redundant class/method
Leftovers from a merge not cleaned up adequately.
2019-09-12 15:26:11 +00:00
|
|
|
assert table.get_string("hello") == "world"
|