# spaCy/spacy/tests/tokenizer/test_tokenizer.py

# coding: utf-8
from __future__ import unicode_literals

import pytest
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
from spacy.util import ensure_path


def test_tokenizer_handles_no_word(tokenizer):
    """Tokenizing the empty string must yield a result of length zero."""
    result = tokenizer("")
    assert len(result) == 0


@pytest.mark.parametrize('text', ["lorem"])
def test_tokenizer_handles_single_word(tokenizer, text):
    """The first token of a one-word input must carry the input text itself."""
    first_token = tokenizer(text)[0]
    assert first_token.text == text


def test_tokenizer_handles_punct(tokenizer):
    """Check that "Lorem, ipsum." is split into four tokens.

    The first three tokens are compared against their expected texts; the
    comma must be a token of its own and not remain attached to "Lorem".
    """
    doc = tokenizer("Lorem, ipsum.")
    assert len(doc) == 4
    # Length is already verified above, so indexing 0..2 cannot go out of range.
    leading_texts = [doc[index].text for index in range(3)]
    assert leading_texts == ["Lorem", ",", "ipsum"]
    assert doc[1].text != "Lorem"


def test_tokenizer_handles_digits(tokenizer):
    """Check that "Lorem ipsum: 1984." yields five tokens with "1984" at index 3.

    The assertions are not applied when the first token reports one of the
    exempted language codes ("hu", "bn"); in that case the test passes
    without checking anything.
    """
    doc = tokenizer("Lorem ipsum: 1984.")

    # Guard clause: exempted languages skip every check below.
    if doc[0].lang_ in ("hu", "bn"):
        return

    assert len(doc) == 5
    assert doc[0].text == "Lorem"
    assert doc[3].text == "1984"


@pytest.mark.parametrize('text', [
    "google.com",
    "python.org",
    "spacy.io",
    "explosion.ai",
    "http://www.google.com",
])
def test_tokenizer_keep_urls(tokenizer, text):
    """Each URL-like input must come back as exactly one token."""
    token_count = len(tokenizer(text))
    assert token_count == 1


@pytest.mark.parametrize('text', ["NASDAQ:GOOG"])
def test_tokenizer_colons(tokenizer, text):
    """A colon-joined pair such as "NASDAQ:GOOG" must produce three tokens."""
    token_count = len(tokenizer(text))
    assert token_count == 3


@pytest.mark.parametrize('text', [
    "hello123@example.com",
    "hi+there@gmail.it",
    "matt@explosion.ai",
])
def test_tokenizer_keeps_email(tokenizer, text):
    """Each e-mail address must come back as exactly one token."""
    token_count = len(tokenizer(text))
    assert token_count == 1


def test_tokenizer_handles_long_text(tokenizer):
    """A multi-paragraph passage must tokenize into more than five tokens."""
    passage = """Lorem ipsum dolor sit amet, consectetur adipiscing elit

Cras egestas orci non porttitor maximus.
Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate.

Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris.

"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo."""

    token_count = len(tokenizer(passage))
    assert token_count > 5


@pytest.mark.parametrize('file_name', ["sun.txt"])
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
    """Tokenize the contents of a text file stored next to this test module.

    The file is resolved relative to this module's directory, read as UTF-8,
    and must be non-empty; tokenizing it must yield more than 100 tokens.
    """
    loc = ensure_path(__file__).parent / file_name
    # Use a context manager so the handle is closed deterministically, even
    # if read() raises, instead of leaving it to the garbage collector.
    with loc.open('r', encoding='utf8') as file_:
        text = file_.read()
    assert len(text) != 0
    tokens = tokenizer(text)
    assert len(tokens) > 100


def test_tokenizer_suspected_freeing_strings(tokenizer):
    """Tokenize two similar sentences and check both still start with "Lorem".

    Both inputs are tokenized before either result is inspected, so the first
    result is read only after the tokenizer has processed a second text.
    NOTE(review): going by the test name, this presumably guards against
    string data of an earlier result being freed -- confirm against history.
    """
    first_doc = tokenizer("Lorem dolor sit amet, consectetur adipiscing elit.")
    second_doc = tokenizer("Lorem ipsum dolor sit amet, consectetur adipiscing elit.")
    assert first_doc[0].text == "Lorem"
    assert second_doc[0].text == "Lorem"


@pytest.mark.parametrize('text,tokens', [
    ("lorem", [{'orth': 'lo'}, {'orth': 'rem'}])])
def test_tokenizer_add_special_case(tokenizer, text, tokens):
    """After registering a special case, the text splits into the given pieces.

    The first two resulting tokens must match the 'orth' values of the first
    two entries in ``tokens``.
    """
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    for position in (0, 1):
        assert doc[position].text == tokens[position]['orth']


@pytest.mark.parametrize('text,tokens', [
    ("lorem", [{'orth': 'lo', 'tag': 'NN'}, {'orth': 'rem'}])])
def test_tokenizer_add_special_case_tag(text, tokens):
    """A special case carrying a 'tag' sets both tag and part of speech.

    A fresh tokenizer is built over a vocab whose tag map sends 'NN' to
    pos 'NOUN'; after the special case is added, the first token must expose
    the given tag and the 'NOUN' part of speech, and the second token must
    have the second entry's 'orth' text.
    """
    tag_map = {'NN': {'pos': 'NOUN'}}
    tokenizer = Tokenizer(Vocab(tag_map=tag_map), {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)

    head_spec = tokens[0]
    head = doc[0]
    assert head.text == head_spec['orth']
    assert head.tag_ == head_spec['tag']
    assert head.pos_ == 'NOUN'
    assert doc[1].text == tokens[1]['orth']
Add general sanity tests for all tokenizers 2017-01-05 15:25:38 +00:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

Merge tokenizer tests 2017-01-13 00:34:14 +00:00			`import pytest`
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-24 21:38:44 +00:00			`from spacy.vocab import Vocab`
			`from spacy.tokenizer import Tokenizer`
			`from spacy.util import ensure_path`
Merge tokenizer tests 2017-01-13 00:34:14 +00:00
Add general sanity tests for all tokenizers 2017-01-05 15:25:38 +00:00
			`def test_tokenizer_handles_no_word(tokenizer):`
			`tokens = tokenizer("")`
			`assert len(tokens) == 0`


			`@pytest.mark.parametrize('text', ["lorem"])`
			`def test_tokenizer_handles_single_word(tokenizer, text):`
			`tokens = tokenizer(text)`
			`assert tokens[0].text == text`


			`def test_tokenizer_handles_punct(tokenizer):`
			`text = "Lorem, ipsum."`
			`tokens = tokenizer(text)`
			`assert len(tokens) == 4`
			`assert tokens[0].text == "Lorem"`
			`assert tokens[1].text == ","`
			`assert tokens[2].text == "ipsum"`
			`assert tokens[1].text != "Lorem"`


			`def test_tokenizer_handles_digits(tokenizer):`
add tests for Bengali 2017-03-05 01:11:26 +00:00			`exceptions = ["hu", "bn"]`
Add general sanity tests for all tokenizers 2017-01-05 15:25:38 +00:00			`text = "Lorem ipsum: 1984."`
			`tokens = tokenizer(text)`

			`if tokens[0].lang_ not in exceptions:`
			`assert len(tokens) == 5`
			`assert tokens[0].text == "Lorem"`
			`assert tokens[3].text == "1984"`


Fix error in test case parameterization 2017-03-09 19:18:21 +00:00			`@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai", "http://www.google.com"])`
Move non-English-specific tests back to general tokenizer tests 2017-01-05 17:09:29 +00:00			`def test_tokenizer_keep_urls(tokenizer, text):`
			`tokens = tokenizer(text)`
			`assert len(tokens) == 1`


Issue #840 - URL pattern too broad 2017-03-04 22:13:11 +00:00			`@pytest.mark.parametrize('text', ["NASDAQ:GOOG"])`
			`def test_tokenizer_colons(tokenizer, text):`
			`tokens = tokenizer(text)`
			`assert len(tokens) == 3`


Move non-English-specific tests back to general tokenizer tests 2017-01-05 17:09:29 +00:00			`@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])`
			`def test_tokenizer_keeps_email(tokenizer, text):`
			`tokens = tokenizer(text)`
			`assert len(tokens) == 1`


Add general sanity tests for all tokenizers 2017-01-05 15:25:38 +00:00			`def test_tokenizer_handles_long_text(tokenizer):`
			`text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit`

			`Cras egestas orci non porttitor maximus.`
			`Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate.`

			`Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris.`

			`"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo."""`

			`tokens = tokenizer(text)`
			`assert len(tokens) > 5`


Move non-English-specific tests back to general tokenizer tests 2017-01-05 17:09:29 +00:00			`@pytest.mark.parametrize('file_name', ["sun.txt"])`
			`def test_tokenizer_handle_text_from_file(tokenizer, file_name):`
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-24 21:38:44 +00:00			`loc = ensure_path(__file__).parent / file_name`
Remove unused utf8open util and replace os.path with ensure_path 2017-04-16 17:51:29 +00:00			`text = loc.open('r', encoding='utf8').read()`
Move non-English-specific tests back to general tokenizer tests 2017-01-05 17:09:29 +00:00			`assert len(text) != 0`
			`tokens = tokenizer(text)`
			`assert len(tokens) > 100`


Add general sanity tests for all tokenizers 2017-01-05 15:25:38 +00:00			`def test_tokenizer_suspected_freeing_strings(tokenizer):`
			`text1 = "Lorem dolor sit amet, consectetur adipiscing elit."`
			`text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."`
			`tokens1 = tokenizer(text1)`
			`tokens2 = tokenizer(text2)`
			`assert tokens1[0].text == "Lorem"`
			`assert tokens2[0].text == "Lorem"`
Merge tokenizer tests 2017-01-13 00:34:14 +00:00

			`@pytest.mark.parametrize('text,tokens', [`
			`("lorem", [{'orth': 'lo'}, {'orth': 'rem'}])])`
			`def test_tokenizer_add_special_case(tokenizer, text, tokens):`
			`tokenizer.add_special_case(text, tokens)`
			`doc = tokenizer(text)`
			`assert doc[0].text == tokens[0]['orth']`
			`assert doc[1].text == tokens[1]['orth']`


			`@pytest.mark.parametrize('text,tokens', [`
			`("lorem", [{'orth': 'lo', 'tag': 'NN'}, {'orth': 'rem'}])])`
			`def test_tokenizer_add_special_case_tag(text, tokens):`
			`vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})`
			`tokenizer = Tokenizer(vocab, {}, None, None, None)`
			`tokenizer.add_special_case(text, tokens)`
			`doc = tokenizer(text)`
			`assert doc[0].text == tokens[0]['orth']`
			`assert doc[0].tag_ == tokens[0]['tag']`
			`assert doc[0].pos_ == 'NOUN'`
			`assert doc[1].text == tokens[1]['orth']`