# setup.py: add 'spacy.tests.stringstore' to the PACKAGES list.
diff --git a/setup.py b/setup.py
index 1609341b7..c51ef8c44 100644
--- a/setup.py
+++ b/setup.py
@@ -42,6 +42,7 @@ PACKAGES = [
     'spacy.tests.parser',
     'spacy.tests.serialize',
     'spacy.tests.spans',
+    'spacy.tests.stringstore',
     'spacy.tests.tagger',
     'spacy.tests.tokenizer',
     'spacy.tests.doc',
# spacy/tests/stringstore/__init__.py: recorded by git as a 100%-similarity
# rename of spacy/tests/morphology/__init__.py, with no content hunk.
# NOTE(review): presumably both files are empty package markers -- confirm.
diff --git a/spacy/tests/morphology/__init__.py b/spacy/tests/stringstore/__init__.py
similarity index 100%
rename from spacy/tests/morphology/__init__.py
rename to spacy/tests/stringstore/__init__.py
# New test file: ids looked up after set_frozen(True) resolve back to their
# string until flush_oov() is called, after which the lookup raises IndexError.
# The `stringstore` argument is a pytest fixture that this patch does not
# define (presumably provided by a conftest.py -- confirm).
diff --git a/spacy/tests/stringstore/test_freeze_string_store.py b/spacy/tests/stringstore/test_freeze_string_store.py
new file mode 100644
index 000000000..96d7912b2
--- /dev/null
+++ b/spacy/tests/stringstore/test_freeze_string_store.py
@@ -0,0 +1,23 @@
+# coding: utf-8
+"""Test the possibly temporary workaround of flushing the stringstore of OOV words."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', [["a", "b", "c"]])
+def test_stringstore_freeze_oov(stringstore, text):
+    assert stringstore[text[0]] == 1
+    assert stringstore[text[1]] == 2
+
+    stringstore.set_frozen(True)
+    s = stringstore[text[2]]
+    assert s >= 4
+    s_ = stringstore[s]
+    assert s_ == text[2]
+
+    stringstore.flush_oov()
+    with pytest.raises(IndexError):
+        s_ = stringstore[s]
# New test file: StringStore id assignment for bytes and unicode keys,
# id -> string lookup, strings of 254-513 characters, and a dump/load round
# trip. The `stringstore` and `text_file` arguments are pytest fixtures that
# this patch does not define.
diff --git a/spacy/tests/stringstore/test_stringstore.py b/spacy/tests/stringstore/test_stringstore.py
new file mode 100644
index 000000000..ebbec01d9
--- /dev/null
+++ b/spacy/tests/stringstore/test_stringstore.py
@@ -0,0 +1,78 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ...strings import StringStore
+
+import pytest
+
+
+@pytest.mark.parametrize('text1,text2,text3', [(b'Hello', b'goodbye', b'hello')])
+def test_stringstore_save_bytes(stringstore, text1, text2, text3):
+    i = stringstore[text1]
+    assert i == 1
+    assert stringstore[text1] == 1
+    assert stringstore[text2] != i
+    assert stringstore[text3] != i
+    assert i == 1
+
+
+@pytest.mark.parametrize('text1,text2,text3', [('Hello', 'goodbye', 'hello')])
+def test_stringstore_save_unicode(stringstore, text1, text2, text3):
+    i = stringstore[text1]
+    assert i == 1
+    assert stringstore[text1] == 1
+    assert stringstore[text2] != i
+    assert stringstore[text3] != i
+    assert i == 1
+
+
+@pytest.mark.parametrize('text', [b'A'])
+def test_stringstore_retrieve_id(stringstore, text):
+    i = stringstore[text]
+    assert stringstore.size == 1
+    assert stringstore[1] == text.decode('utf8')
+    with pytest.raises(IndexError):
+        stringstore[2]
+
+
+@pytest.mark.parametrize('text1,text2', [(b'0123456789', b'A')])
+def test_stringstore_med_string(stringstore, text1, text2):
+    store = stringstore[text1]
+    assert stringstore[store] == text1.decode('utf8')
+    dummy = stringstore[text2]
+    assert stringstore[text1] == store
+
+
+def test_stringstore_long_string(stringstore):
+    text = "INFORMATIVE](http://www.google.com/search?as_q=RedditMonkey&hl=en&num=50&btnG=Google+Search&as_epq=&as_oq=&as_eq=&lr=&as_ft=i&as_filetype=&as_qdr=all&as_nlo=&as_nhi=&as_occt=any&as_dt=i&as_sitesearch=&as_rights=&safe=off"
+    store = stringstore[text]
+    assert stringstore[store] == text
+
+
+@pytest.mark.parametrize('factor', [254, 255, 256])
+def test_stringstore_multiply(stringstore, factor):
+    text = 'a' * factor
+    store = stringstore[text]
+    assert stringstore[store] == text
+
+
+def test_stringstore_massive_strings(stringstore):
+    text = 'a' * 511
+    store = stringstore[text]
+    assert stringstore[store] == text
+    text2 = 'z' * 512
+    store = stringstore[text2]
+    assert stringstore[store] == text2
+    text3 = '1' * 513
+    store = stringstore[text3]
+    assert stringstore[store] == text3
+
+
+@pytest.mark.parametrize('text', ["qqqqq"])
+def test_stringstore_dump_load(stringstore, text_file, text):
+    store = stringstore[text]
+    stringstore.dump(text_file)
+    text_file.seek(0)
+    new_stringstore = StringStore()
+    new_stringstore.load(text_file)
+    assert new_stringstore[store] == text
# Delete the old OOV-flush test from spacy/tests/vocab. It built its own
# StringStore() and used the literals u'a', u'b', u'c'; the same assertion
# sequence is added under spacy/tests/stringstore/test_freeze_string_store.py
# by this patch.
diff --git a/spacy/tests/vocab/test_freeze_string_store.py b/spacy/tests/vocab/test_freeze_string_store.py
deleted file mode 100644
index 2e8e13153..000000000
--- a/spacy/tests/vocab/test_freeze_string_store.py
+++ /dev/null
@@ -1,21 +0,0 @@
-'''Test the possibly temporary workaround of flushing the stringstore of OOV words.'''
-import pytest
-
-from ...strings import StringStore
-
-
-def test_oov():
-    strings = StringStore()
-    a = strings[u'a']
-    b = strings[u'b']
-
-    assert a == 1
-    assert b == 2
-    strings.set_frozen(True)
-    c = strings[u'c']
-    assert c >= 4
-    c_ = strings[c]
-    assert c_ == u'c'
-    strings.flush_oov()
-    with pytest.raises(IndexError):
-        c_ = strings[c]