# spaCy/spacy/tests/regression/test_issue1001-1500.py

# coding: utf-8
from __future__ import unicode_literals

import pytest
import re
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lemmatizer import Lemmatizer
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part


def test_issue1242():
    """Test that an empty string yields an empty doc, via nlp() and nlp.pipe()."""
    nlp = English()
    assert len(nlp('')) == 0
    # nlp.pipe() must handle an empty text alongside a non-empty one.
    lengths = [len(piped) for piped in nlp.pipe(['', 'hello'])]
    assert lengths[0] == 0
    assert lengths[1] == 1


def test_issue1250():
    """Test cached special cases.

    The same text is processed twice and must give the same lemmas on
    both passes.
    """
    nlp = English()
    nlp.tokenizer.add_special_case(
        'reimbur', [{ORTH: 'reimbur', LEMMA: 'reimburse', POS: 'VERB'}])
    expected = ['reimburse', ',', 'reimburse', '...']
    for _ in range(2):
        observed = [token.lemma_ for token in nlp('reimbur, reimbur...')]
        assert observed == expected


def test_issue1257():
    """Test that tokens compare correctly.

    The first tokens of two separate docs (each with its own Vocab) share
    the text 'a' but must not compare equal.
    """
    left_doc = Doc(Vocab(), words=['a', 'b', 'c'])
    right_doc = Doc(Vocab(), words=['a', 'c', 'e'])
    left_token, right_token = left_doc[0], right_doc[0]
    # Exercise both the != and the == operator.
    assert left_token != right_token
    assert not left_token == right_token


def test_issue1375():
    """Test that token.nbor() raises IndexError for out-of-bounds access."""
    doc = Doc(Vocab(), words=['0', '1', '2'])
    first, middle, last = doc[0], doc[1], doc[2]
    # Stepping left from the first token leaves the doc.
    with pytest.raises(IndexError):
        assert first.nbor(-1)
    assert middle.nbor(-1).text == '0'
    # Stepping right from the last token leaves the doc.
    with pytest.raises(IndexError):
        assert last.nbor(1)
    assert middle.nbor(1).text == '2'


def test_issue1387():
    """Test that 'coping' tagged as VBG keeps its text and gets the lemma 'cope'."""
    lemmatizer = Lemmatizer(
        {"verb": ("cope", "cop")},        # index
        {"verb": {"coping": ("cope",)}},  # exc
        {"verb": [["ing", ""]]},          # rules
    )
    vocab = Vocab(
        lemmatizer=lemmatizer,
        tag_map={'VBG': {POS: VERB, VerbForm_part: True}})
    doc = Doc(vocab, words=["coping"])
    token = doc[0]
    token.tag_ = 'VBG'
    assert token.text == "coping"
    assert token.lemma_ == "cope"


def test_issue1434():
    """Test matches occur when optional element at end of short doc."""
    vocab = Vocab(lex_attr_getters=LEX_ATTRS)
    # One doc where the optional token is present, one where the doc ends first.
    docs = [Doc(vocab, words=words)
            for words in (['Hello', 'World'], ['Hello'])]
    matcher = Matcher(vocab)
    matcher.add('MyMatcher', None,
                [{'ORTH': 'Hello'}, {'IS_ALPHA': True, 'OP': '?'}])
    for doc in docs:
        assert matcher(doc)


@pytest.mark.parametrize('string,start,end', [
    ('a', 0, 1), ('a b', 0, 2), ('a c', 0, 1), ('a b c', 0, 2),
    ('a b b c', 0, 3), ('a b b', 0, 3),])
def test_issue1450(string, start, end):
    """Test matcher works when patterns end with * operator.

    string: whitespace-separated words used to build the doc.
    start, end: expected token offsets of the last match returned by the
        matcher, or None when no match is expected.
    """
    pattern = [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}]
    matcher = Matcher(Vocab())
    matcher.add("TSTEND", None, pattern)
    doc = Doc(Vocab(), words=string.split())
    matches = matcher(doc)
    if start is None or end is None:
        assert matches == []
        # Stop here: indexing matches[-1] on an empty list would raise
        # IndexError and fail a case that is expected to have no matches.
        return
    assert matches[-1][1] == start
    assert matches[-1][2] == end


def test_issue1488():
    """Test that a tokenizer built from custom prefix, suffix, infix and
    token_match regexes never produces a token with empty text."""
    nlp = English()
    nlp.tokenizer = Tokenizer(
        nlp.vocab, {},
        prefix_search=re.compile(r'''[\[\("']''').search,
        suffix_search=re.compile(r'''[\]\)"']''').search,
        infix_finditer=re.compile(r'''[-~\.]''').finditer,
        token_match=re.compile(r'''^https?://''').match)
    assert all(token.text for token in nlp("This is a test."))


def test_issue1494():
    """Test tokenization with a custom infix pattern matching any character
    outside a-z, using a tokenizer that has no prefix or suffix rules."""
    nlp = English()
    nlp.tokenizer = Tokenizer(
        nlp.vocab, {}, infix_finditer=re.compile(r'''[^a-z]''').finditer)
    texts_and_tokens = (
        ('token 123test', ['token', '1', '2', '3', 'test']),
        ('token 1test', ['token', '1test']),
        ('hello...test', ['hello', '.', '.', '.', 'test']),
    )
    for text, expected in texts_and_tokens:
        observed = [token.text for token in nlp(text)]
        assert observed == expected
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-24 21:38:44 +00:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`import pytest`
			`import re`
			`from spacy.tokens import Doc`
			`from spacy.vocab import Vocab`
			`from spacy.lang.en import English`
			`from spacy.lang.lex_attrs import LEX_ATTRS`
			`from spacy.matcher import Matcher`
			`from spacy.tokenizer import Tokenizer`
			`from spacy.lemmatizer import Lemmatizer`
			`from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part`


			`def test_issue1242():`
			`nlp = English()`
			`doc = nlp('')`
			`assert len(doc) == 0`
			`docs = list(nlp.pipe(['', 'hello']))`
			`assert len(docs[0]) == 0`
			`assert len(docs[1]) == 1`


			`def test_issue1250():`
			`"""Test cached special cases."""`
			`special_case = [{ORTH: 'reimbur', LEMMA: 'reimburse', POS: 'VERB'}]`
			`nlp = English()`
			`nlp.tokenizer.add_special_case('reimbur', special_case)`
			`lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')]`
			`assert lemmas == ['reimburse', ',', 'reimburse', '...']`
			`lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')]`
			`assert lemmas == ['reimburse', ',', 'reimburse', '...']`


			`def test_issue1257():`
			`"""Test that tokens compare correctly."""`
			`doc1 = Doc(Vocab(), words=['a', 'b', 'c'])`
			`doc2 = Doc(Vocab(), words=['a', 'c', 'e'])`
			`assert doc1[0] != doc2[0]`
			`assert not doc1[0] == doc2[0]`


			`def test_issue1375():`
			`"""Test that token.nbor() raises IndexError for out-of-bounds access."""`
			`doc = Doc(Vocab(), words=['0', '1', '2'])`
			`with pytest.raises(IndexError):`
			`assert doc[0].nbor(-1)`
			`assert doc[1].nbor(-1).text == '0'`
			`with pytest.raises(IndexError):`
			`assert doc[2].nbor(1)`
			`assert doc[1].nbor(1).text == '2'`


			`def test_issue1387():`
			`tag_map = {'VBG': {POS: VERB, VerbForm_part: True}}`
			`index = {"verb": ("cope","cop")}`
			`exc = {"verb": {"coping": ("cope",)}}`
			`rules = {"verb": [["ing", ""]]}`
			`lemmatizer = Lemmatizer(index, exc, rules)`
			`vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)`
			`doc = Doc(vocab, words=["coping"])`
			`doc[0].tag_ = 'VBG'`
			`assert doc[0].text == "coping"`
			`assert doc[0].lemma_ == "cope"`


			`def test_issue1434():`
			`"""Test matches occur when optional element at end of short doc."""`
			`pattern = [{'ORTH': 'Hello' }, {'IS_ALPHA': True, 'OP': '?'}]`
			`vocab = Vocab(lex_attr_getters=LEX_ATTRS)`
			`hello_world = Doc(vocab, words=['Hello', 'World'])`
			`hello = Doc(vocab, words=['Hello'])`
			`matcher = Matcher(vocab)`
			`matcher.add('MyMatcher', None, pattern)`
			`matches = matcher(hello_world)`
			`assert matches`
			`matches = matcher(hello)`
			`assert matches`


			`@pytest.mark.parametrize('string,start,end', [`
			`('a', 0, 1), ('a b', 0, 2), ('a c', 0, 1), ('a b c', 0, 2),`
			`('a b b c', 0, 3), ('a b b', 0, 3),])`
			`def test_issue1450(string, start, end):`
			`"""Test matcher works when patterns end with * operator."""`
			`pattern = [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}]`
			`matcher = Matcher(Vocab())`
			`matcher.add("TSTEND", None, pattern)`
			`doc = Doc(Vocab(), words=string.split())`
			`matches = matcher(doc)`
			`if start is None or end is None:`
			`assert matches == []`
			`assert matches[-1][1] == start`
			`assert matches[-1][2] == end`


			`def test_issue1488():`
			`prefix_re = re.compile(r'''[\[\("']''')`
			`suffix_re = re.compile(r'''[\]\)"']''')`
			`infix_re = re.compile(r'''[-~\.]''')`
			`simple_url_re = re.compile(r'''^https?://''')`

			`def my_tokenizer(nlp):`
			`return Tokenizer(nlp.vocab, {},`
			`prefix_search=prefix_re.search,`
			`suffix_search=suffix_re.search,`
			`infix_finditer=infix_re.finditer,`
			`token_match=simple_url_re.match)`

			`nlp = English()`
			`nlp.tokenizer = my_tokenizer(nlp)`
			`doc = nlp("This is a test.")`
			`for token in doc:`
			`assert token.text`


			`def test_issue1494():`
			`infix_re = re.compile(r'''[^a-z]''')`
			`test_cases = [('token 123test', ['token', '1', '2', '3', 'test']),`
			`('token 1test', ['token', '1test']),`
			`('hello...test', ['hello', '.', '.', '.', 'test'])]`
			`new_tokenizer = lambda nlp: Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)`
			`nlp = English()`
			`nlp.tokenizer = new_tokenizer(nlp)`
			`for text, expected in test_cases:`
			`assert [token.text for token in nlp(text)] == expected`