spaCy/spacy/tests/tokenizer/test_tokenizer.py


# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
from spacy.util import ensure_path
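
# Note: the `tokenizer` fixture used by every test below is not defined in
# this file; it comes from the test suite's conftest.py and is assumed to be
# a plain rule-based tokenizer with no statistical model loaded (see the
# sketch at the end of this module).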


def test_tokenizer_handles_no_word(tokenizer):
    tokens = tokenizer("")
    assert len(tokens) == 0


@pytest.mark.parametrize("text", ["lorem"])
def test_tokenizer_handles_single_word(tokenizer, text):
    tokens = tokenizer(text)
    assert tokens[0].text == text


def test_tokenizer_handles_punct(tokenizer):
    text = "Lorem, ipsum."
    tokens = tokenizer(text)
    assert len(tokens) == 4
    assert tokens[0].text == "Lorem"
    assert tokens[1].text == ","
    assert tokens[2].text == "ipsum"
    assert tokens[1].text != "Lorem"


def test_tokenizer_handles_punct_braces(tokenizer):
    text = "Lorem, (ipsum)."
    tokens = tokenizer(text)
    assert len(tokens) == 6


def test_tokenizer_handles_digits(tokenizer):
    # Skip the detailed assertions for languages whose punctuation rules
    # split this text differently.
    exceptions = ["hu", "bn"]
    text = "Lorem ipsum: 1984."
    tokens = tokenizer(text)
    if tokens[0].lang_ not in exceptions:
        assert len(tokens) == 5
        assert tokens[0].text == "Lorem"
        assert tokens[3].text == "1984"


@pytest.mark.parametrize(
    "text",
    ["google.com", "python.org", "spacy.io", "explosion.ai", "http://www.google.com"],
)
def test_tokenizer_keep_urls(tokenizer, text):
    tokens = tokenizer(text)
    assert len(tokens) == 1


@pytest.mark.parametrize("text", ["NASDAQ:GOOG"])
def test_tokenizer_colons(tokenizer, text):
    tokens = tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize(
    "text", ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"]
)
def test_tokenizer_keeps_email(tokenizer, text):
    tokens = tokenizer(text)
    assert len(tokens) == 1


def test_tokenizer_handles_long_text(tokenizer):
    text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit
Cras egestas orci non porttitor maximus.
Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate.
Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris.
"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo."""
    tokens = tokenizer(text)
    assert len(tokens) > 5


@pytest.mark.parametrize("file_name", ["sun.txt"])
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
    # Read the bundled sample text that sits next to this test file.
    loc = ensure_path(__file__).parent / file_name
    text = loc.open("r", encoding="utf8").read()
    assert len(text) != 0
    tokens = tokenizer(text)
    assert len(tokens) > 100


def test_tokenizer_suspected_freeing_strings(tokenizer):
    # Regression check: token texts from an earlier doc should still resolve
    # correctly after tokenizing another text (the name refers to a suspected
    # bug where strings were freed prematurely).
    text1 = "Lorem dolor sit amet, consectetur adipiscing elit."
    text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
    tokens1 = tokenizer(text1)
    tokens2 = tokenizer(text2)
    assert tokens1[0].text == "Lorem"
    assert tokens2[0].text == "Lorem"


@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo"}, {"orth": "rem"}])])
def test_tokenizer_add_special_case(tokenizer, text, tokens):
    # A special case overrides the default rules for an exact input string.
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[1].text == tokens[1]["orth"]


@pytest.mark.parametrize(
    "text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])]
)
def test_tokenizer_add_special_case_tag(text, tokens):
    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
    # Build a bare tokenizer with no special-case rules and no prefix, suffix
    # or infix patterns, so only the special case added below applies.
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[0].tag_ == tokens[0]["tag"]
    assert doc[0].pos_ == "NOUN"
    assert doc[1].text == tokens[1]["orth"]
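

# For reference, a minimal sketch of the kind of fixture these tests expect.
# This is an assumption for illustration only -- the actual fixture lives in
# the suite's conftest.py and may be configured differently:
#
#     import pytest
#     from spacy.lang.en import English
#
#     @pytest.fixture
#     def tokenizer():
#         return English().tokenizer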