spaCy/spacy/tests/regression/test_issue1-1000.py

# coding: utf-8
from __future__ import unicode_literals

import pytest
import random
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, ORTH, LOWER
from spacy.symbols import POS, VERB, VerbForm_inf
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.lemmatizer import Lemmatizer
from spacy.tokens import Doc

from ..util import get_doc, make_tempdir


@pytest.mark.parametrize('patterns', [
    [[{'LOWER': 'celtics'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
    [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'celtics'}]]])
def test_issue118(en_tokenizer, patterns):
    """Test Issue #118: overlapping matches where one pattern is a suffix of
    the other. Whatever order the patterns are registered in, the two-token
    match is expected first, and it is then set as the only entity."""
    sentence = "how many points did lebron james score against the boston celtics last night"
    doc = en_tokenizer(sentence)
    org_label = doc.vocab.strings['ORG']
    matcher = Matcher(doc.vocab)
    matcher.add("BostonCeltics", None, *patterns)
    # The tokenizer alone must not have produced any entities.
    assert not list(doc.ents)
    found = [(org_label, begin, stop) for _, begin, stop in matcher(doc)]
    assert found == [(org_label, 9, 11), (org_label, 10, 11)]
    # Keep only the longer match as an entity.
    doc.ents = found[:1]
    entities = list(doc.ents)
    assert len(entities) == 1
    only = entities[0]
    assert only.label == org_label
    assert (only.start, only.end) == (9, 11)


@pytest.mark.parametrize('patterns', [
    [[{'LOWER': 'boston'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
    [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'boston'}]]])
def test_issue118_prefix_reorder(en_tokenizer, patterns):
    """Test Issue #118: overlapping matches where one pattern is a prefix of
    the other. Whatever order the patterns are registered in, the one-token
    match is expected first and the two-token match second."""
    sentence = "how many points did lebron james score against the boston celtics last night"
    doc = en_tokenizer(sentence)
    org_label = doc.vocab.strings['ORG']
    matcher = Matcher(doc.vocab)
    matcher.add('BostonCeltics', None, *patterns)
    assert not list(doc.ents)
    found = [(org_label, begin, stop) for _, begin, stop in matcher(doc)]
    # Skip the first (shorter) match; only the remaining one becomes an entity.
    doc.ents += tuple(found[1:])
    assert found == [(org_label, 9, 10), (org_label, 9, 11)]
    entities = doc.ents
    assert len(entities) == 1
    only = entities[0]
    assert only.label == org_label
    assert (only.start, only.end) == (9, 11)


def test_issue242(en_tokenizer):
    """Test Issue #242: two multi-word patterns that share the middle token
    ("food safety" / "safety standards") must both be reported."""
    doc = en_tokenizer("There are different food safety standards in different countries.")
    food_safety = [{'LOWER': 'food'}, {'LOWER': 'safety'}]
    safety_standards = [{'LOWER': 'safety'}, {'LOWER': 'standards'}]
    matcher = Matcher(doc.vocab)
    matcher.add('FOOD', None, food_safety, safety_standards)

    found = [(key, begin, stop) for key, begin, stop in matcher(doc)]
    doc.ents += tuple(found)
    first, second = found
    # Compare the (start, end) part of each match.
    assert first[1:] == (3, 5)
    assert second[1:] == (4, 6)


def test_issue309(en_tokenizer):
    """Test Issue #309: sentence iteration works on a text that consists of a
    single space."""
    tokenized = en_tokenizer(" ")
    words = [tok.text for tok in tokenized]
    doc = get_doc(tokenized.vocab, words=words, heads=[0], deps=['ROOT'])
    doc.is_parsed = True
    assert len(doc) == 1
    assert len(list(doc.sents)) == 1


def test_issue351(en_tokenizer):
    """Test Issue #351: character offsets for a text that starts with three
    spaces. The first token has length 3 at offset 0 and the second token
    starts at offset 3."""
    doc = en_tokenizer("   This is a cat.")
    leading = doc[0]
    assert leading.idx == 0
    assert len(leading) == 3
    assert doc[1].idx == 3


def test_issue360(en_tokenizer):
    """Test Issue #360: a text containing a long run of periods is split into
    more than two tokens."""
    doc = en_tokenizer('$45...............Asking')
    token_count = len(doc)
    assert token_count > 2


@pytest.mark.parametrize('text1,text2', [("cat", "dog")])
def test_issue361(en_vocab, text1, text2):
    """Test Issue #361: two lookups of the same string compare equal, and
    lookups of different strings compare unequal."""
    # Two separate lookups on purpose: equality must hold across lookups.
    first, again = en_vocab[text1], en_vocab[text1]
    assert first == again
    left, right = en_vocab[text1], en_vocab[text2]
    assert left != right


def test_issue587(en_tokenizer):
    """Test Issue #587: the Matcher doesn't segfault as patterns sharing a
    common prefix are added one by one and the doc is re-matched each time."""
    doc = en_tokenizer('a b; c')
    matcher = Matcher(doc.vocab)
    # (key, pattern, number of matches expected once the pattern is added)
    steps = [
        ('TEST1', [{ORTH: 'a'}, {ORTH: 'b'}], 1),
        ('TEST2', [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}], 2),
        # Ends in 'd', which is not in the text, so the count stays at 2.
        ('TEST3', [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}], 2),
    ]
    for key, pattern, expected in steps:
        matcher.add(key, None, pattern)
        assert len(matcher(doc)) == expected


def test_issue588(en_vocab):
    """Test Issue #588: adding an empty pattern to the Matcher raises a
    ValueError."""
    empty_pattern = []
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add('TEST', None, empty_pattern)


@pytest.mark.xfail
def test_issue589():
    """Test Issue #589: build a Doc from a vocab whose string store was
    frozen beforehand. Marked as an expected failure."""
    frozen_vocab = Vocab()
    frozen_vocab.strings.set_frozen(True)
    Doc(frozen_vocab, words=['whata'])


def test_issue590(en_vocab):
    """Test Issue #590: two patterns registered under the same key each
    produce a match."""
    words = ['n', '=', '1', ';', 'a', ':', '5', '%']
    doc = Doc(en_vocab, words=words)
    percent_pattern = [{'IS_ALPHA': True}, {'ORTH': ':'}, {'LIKE_NUM': True}, {'ORTH': '%'}]
    assign_pattern = [{'IS_ALPHA': True}, {'ORTH': '='}, {'LIKE_NUM': True}]
    matcher = Matcher(en_vocab)
    matcher.add('ab', None, percent_pattern)
    matcher.add('ab', None, assign_pattern)
    assert len(matcher(doc)) == 2


def test_issue595():
    """Test Issue #595: lemmatization of base forms. "feed", tagged with a
    tag mapped to an infinitive verb, keeps "feed" as its lemma even though
    an "ed" -> "e" suffix rule is registered for verbs."""
    tag_map = {'VB': {POS: VERB, VerbForm_inf: True}}
    suffix_rules = {"verb": [["ed", "e"]]}
    lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, suffix_rules)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"])
    verb = doc[2]
    verb.tag_ = 'VB'
    assert verb.text == 'feed'
    assert verb.lemma_ == 'feed'


def test_issue599(en_vocab):
    """Test Issue #599: the is_parsed flag of an empty Doc survives a
    to_bytes / from_bytes round trip."""
    original = Doc(en_vocab)
    original.is_tagged = True
    original.is_parsed = True
    payload = original.to_bytes()
    restored = Doc(original.vocab)
    restored.from_bytes(payload)
    assert restored.is_parsed


def test_issue600():
    """Test Issue #600: assigning a tag works when the tag map entry is
    written with plain strings. Passes as long as nothing raises."""
    tag_map = {'NN': {'pos': 'NOUN'}}
    doc = Doc(Vocab(tag_map=tag_map), words=["hello"])
    first = doc[0]
    first.tag_ = 'NN'


def test_issue615(en_tokenizer):
    """Test Issue #615: a Matcher callback that merges the matched phrase and
    records it as an entity leaves the doc with a labelled entity."""
    def merge_phrases(matcher, doc, i, matches):
        """Merge every matched phrase, but only when invoked for the final
        match: merging shifts token indices, so all spans are collected first
        and merged in one go."""
        is_last_match = i == len(matches) - 1
        if not is_last_match:
            return None
        # The match key doubles as the entity label.
        labelled_spans = [(key, doc[begin:stop]) for key, begin, stop in matches]
        for label, span in labelled_spans:
            tag = 'NNP' if label else span.root.tag_
            span.merge(tag=tag, lemma=span.text, label=label)
            doc.ents = doc.ents + ((label, span.start, span.end),)

    doc = en_tokenizer("The golf club is broken")
    matcher = Matcher(doc.vocab)
    golf_club = [{'ORTH': "golf"}, {'ORTH': "club"}]
    matcher.add("Sport_Equipment", merge_phrases, golf_club)
    # Only the callback's side effects on the doc matter here.
    matcher(doc)
    entities = list(doc.ents)
    assert entities != []
    assert entities[0].label != 0


@pytest.mark.parametrize('text,number', [("7am", "7"), ("11p.m.", "11")])
def test_issue736(en_tokenizer, text, number):
    """Test Issue #736: times like "7am" are split into two tokens, and the
    first token's text is the number as a string."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    leading = doc[0]
    assert leading.text == number


@pytest.mark.parametrize('text', ["3/4/2012", "01/12/1900"])
def test_issue740(en_tokenizer, text):
    """Test Issue #740: slash-separated dates stay a single token. The
    behaviour is currently inconsistent, because hyphen-separated dates are
    still split; preventing that would be hard without clashing with numeric
    ranges."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 1


def test_issue743():
    """Test Issue #743: a Token can be put into a set, and the object read
    back from the set is the very same Token."""
    doc = Doc(Vocab(), ['hello', 'world'])
    token = doc[0]
    members = list({token})
    assert members[0] is token


@pytest.mark.parametrize('text', ["We were scared", "We Were Scared"])
def test_issue744(en_tokenizer, text):
    """Test Issue #744: 'were' and 'Were' are not treated as contractions by
    the English tokenizer exceptions, so they remain one token."""
    doc = en_tokenizer(text)
    assert len(doc) == 3
    middle = doc[1]
    assert middle.text.lower() == "were"


@pytest.mark.parametrize('text,is_num', [("one", True), ("ten", True),
                                         ("teneleven", False)])
def test_issue759(en_tokenizer, text, is_num):
    """Test Issue #759: like_num is true for number words and false for a
    run-together string such as "teneleven"."""
    doc = en_tokenizer(text)
    first = doc[0]
    assert first.like_num == is_num


@pytest.mark.parametrize('text', ["Shell", "shell", "Shed", "shed"])
def test_issue775(en_tokenizer, text):
    """Test Issue #775: 'Shell', 'shell', 'Shed' and 'shed' are not treated
    as contractions by the English tokenizer exceptions, so each stays a
    single unchanged token."""
    doc = en_tokenizer(text)
    assert len(doc) == 1
    only = doc[0]
    assert only.text == text


@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
    """Test Issue #792: a trailing space must survive tokenization, i.e. the
    tokens' text_with_ws values concatenate back to the input."""
    doc = en_tokenizer(text)
    rebuilt = ''.join(token.text_with_ws for token in doc)
    assert rebuilt == text


@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
    """Control case for Issue #792: texts without a trailing space (none, or
    a trailing newline) are also rebuilt exactly from text_with_ws."""
    doc = en_tokenizer(text)
    rebuilt = ''.join(token.text_with_ws for token in doc)
    assert rebuilt == text


@pytest.mark.parametrize('text,tokens', [
    ('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
    ("exception;--exclusive", ["exception", ";--", "exclusive"]),
    ("day.--Is", ["day", ".--", "Is"]),
    ("refinement:--just", ["refinement", ":--", "just"]),
    ("memories?--To", ["memories", "?--", "To"]),
    ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
    ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"])])
def test_issue801(en_tokenizer, text, tokens):
    """Test Issue #801: punctuation followed by a double hyphen is split off
    as the expected tokens."""
    doc = en_tokenizer(text)
    expected_count = len(tokens)
    assert len(doc) == expected_count
    actual = [tok.text for tok in doc]
    assert actual == tokens


@pytest.mark.parametrize('text,expected_tokens', [
    ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
    ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
])
def test_issue805(sv_tokenizer, text, expected_tokens):
    """Test Issue #805: the Swedish tokenizer keeps abbreviations such as
    "bl.a." and "p.g.a." as single tokens. Whitespace tokens are ignored."""
    doc = sv_tokenizer(text)
    actual = []
    for token in doc:
        if token.is_space:
            continue
        actual.append(token.text)
    assert expected_tokens == actual


def test_issue850():
    """Test Issue #850: the variable-length part of the pattern could also
    match the token that follows it. Check that the ambiguity is resolved to
    one match spanning the whole doc."""
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    # NOTE(review): the flag ID returned here is never used; the pattern below
    # refers to the flag by the string key 'IS_ANY_TOKEN' - confirm intended.
    matcher.vocab.add_flag(lambda x: True)
    bob_to_frank = [{'LOWER': "bob"}, {'OP': '*', 'IS_ANY_TOKEN': True}, {'LOWER': 'frank'}]
    matcher.add('FarAway', None, bob_to_frank)
    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
    found = matcher(doc)
    assert len(found) == 1
    _, begin, stop = found[0]
    assert begin == 0
    assert stop == 4


def test_issue850_basic():
    """Basic case for Issue #850: the '*' operator on a LOWER attribute
    yields one match spanning the whole doc."""
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    # NOTE(review): a flag is registered but the pattern below does not
    # reference it - confirm whether the call is still needed.
    matcher.vocab.add_flag(lambda x: True)
    bob_to_frank = [{'LOWER': "bob"}, {'OP': '*', 'LOWER': 'and'}, {'LOWER': 'frank'}]
    matcher.add('FarAway', None, bob_to_frank)
    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
    found = matcher(doc)
    assert len(found) == 1
    _, begin, stop = found[0]
    assert begin == 0
    assert stop == 4


@pytest.mark.parametrize('text', ["au-delàs", "pair-programmâmes",
                                  "terra-formées", "σ-compacts"])
def test_issue852(fr_tokenizer, text):
    """Test Issue #852: French tokenizer exceptions are imported correctly,
    so these hyphenated words stay a single token."""
    doc = fr_tokenizer(text)
    token_count = len(doc)
    assert token_count == 1


@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
                                  "aaabbb@ccc.com \nThank you!"])
def test_issue859(en_tokenizer, text):
    """Test Issue #859: doc.text reproduces the input exactly, with no extra
    space added."""
    rebuilt = en_tokenizer(text).text
    assert rebuilt == text


@pytest.mark.parametrize('text', ["Datum:2014-06-02\nDokument:76467"])
def test_issue886(en_tokenizer, text):
    """Test Issue #886: for a text containing a newline, token.idx points at
    the token's first character in the original text."""
    doc = en_tokenizer(text)
    for tok in doc:
        surface = tok.text
        # The input contains no spaces, so no token may carry trailing whitespace.
        assert len(surface) == len(tok.text_with_ws)
        assert text[tok.idx] == surface[0]


@pytest.mark.parametrize('text', ["want/need"])
def test_issue891(en_tokenizer, text):
    """Test Issue #891: a "/" between two words is split off as its own
    token."""
    doc = en_tokenizer(text)
    assert len(doc) == 3
    infix = doc[1]
    assert infix.text == "/"


@pytest.mark.parametrize('text,tag,lemma', [
    ("anus", "NN", "anus"),
    ("princess", "NN", "princess"),
    ("inner", "JJ", "inner")
])
def test_issue912(en_vocab, text, tag, lemma):
    """Test Issue #912: words that are already base forms keep themselves as
    lemma once tagged."""
    doc = Doc(en_vocab, words=[text])
    token = doc[0]
    token.tag_ = tag
    assert token.lemma_ == lemma


def test_issue957(en_tokenizer):
    """Test Issue #957: spaCy doesn't hang on a text with many periods.

    The test is skipped when the pytest-timeout plugin is not installed.
    """
    # importorskip takes the importable module name. The plugin's module is
    # `pytest_timeout`; the hyphenated distribution name 'pytest-timeout' can
    # never be imported, so passing it skipped this test unconditionally.
    pytest.importorskip('pytest_timeout')
    # Builds "0.1.2. ... .99": one hundred numbers joined by periods.
    string = '.'.join('%d' % i for i in range(100))
    # Passing means the tokenizer returned at all.
    en_tokenizer(string)


@pytest.mark.xfail
def test_issue999(train_data):
    """Test Issue #999: adding entities and resuming training works passably
    OK. Marked as an expected failure.

    There are two issues here:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    """
    examples = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31,36,"LOCATION"]]],
        ["show me chinese restaurants", [[8,15,"CUISINE"]]],
        ["show me chines restaurants", [[8,14,"CUISINE"]]],
    ]

    nlp = Language()
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
    # Register every label that occurs in the examples before training starts.
    for _, offsets in examples:
        for _begin, _stop, label in offsets:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for _ in range(100):
        random.shuffle(examples)
        for raw_text, entity_offsets in examples:
            nlp.update([raw_text], [{'entities': entity_offsets}])

    # Round-trip the trained pipeline through disk and evaluate the reloaded copy.
    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for raw_text, entity_offsets in examples:
        doc = nlp2(raw_text)
        predicted = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        hit = False
        for start, end, label in entity_offsets:
            span = (start, end)
            if span in predicted:
                # One correctly labelled span is enough for this example.
                assert predicted[span] == label
                hit = True
                break
        if entity_offsets and not hit:
            raise Exception(predicted)
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyway)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flaky~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-24 21:38:44 +00:00
+								# coding: utf-8
 								from __future__ import unicode_literals
 								import pytest
 								import random
 								from spacy.matcher import Matcher
 								from spacy.attrs import IS_PUNCT, ORTH, LOWER
 								from spacy.symbols import POS, VERB, VerbForm_inf
 								from spacy.vocab import Vocab
 								from spacy.language import Language
 								from spacy.lemmatizer import Lemmatizer
 								from spacy.tokens import Doc
 								from ..util import get_doc, make_tempdir
 								@pytest.mark.parametrize('patterns', [
 								    [[{'LOWER': 'celtics'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
 								    [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'celtics'}]]])
 								def test_issue118(en_tokenizer, patterns):
 								    """Test a bug that arose from having overlapping matches"""
 								    text = "how many points did lebron james score against the boston celtics last night"
 								    doc = en_tokenizer(text)
 								    ORG = doc.vocab.strings['ORG']
 								    matcher = Matcher(doc.vocab)
 								    matcher.add("BostonCeltics", None, *patterns)
 								    assert len(list(doc.ents)) == 0
 								    matches = [(ORG, start, end) for _, start, end in matcher(doc)]
 								    assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
 								    doc.ents = matches[:1]
 								    ents = list(doc.ents)
 								    assert len(ents) == 1
 								    assert ents[0].label == ORG
 								    assert ents[0].start == 9
 								    assert ents[0].end == 11
 								@pytest.mark.parametrize('patterns', [
 								    [[{'LOWER': 'boston'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
 								    [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'boston'}]]])
 								def test_issue118_prefix_reorder(en_tokenizer, patterns):
 								    """Test a bug that arose from having overlapping matches"""
 								    text = "how many points did lebron james score against the boston celtics last night"
 								    doc = en_tokenizer(text)
 								    ORG = doc.vocab.strings['ORG']
 								    matcher = Matcher(doc.vocab)
 								    matcher.add('BostonCeltics', None, *patterns)
 								    assert len(list(doc.ents)) == 0
 								    matches = [(ORG, start, end) for _, start, end in matcher(doc)]
 								    doc.ents += tuple(matches)[1:]
 								    assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
 								    ents = doc.ents
 								    assert len(ents) == 1
 								    assert ents[0].label == ORG
 								    assert ents[0].start == 9
 								    assert ents[0].end == 11
 								def test_issue242(en_tokenizer):
 								    """Test overlapping multi-word phrases."""
 								    text = "There are different food safety standards in different countries."
 								    patterns = [[{'LOWER': 'food'}, {'LOWER': 'safety'}],
 								                [{'LOWER': 'safety'}, {'LOWER': 'standards'}]]
 								    doc = en_tokenizer(text)
 								    matcher = Matcher(doc.vocab)
 								    matcher.add('FOOD', None, *patterns)
 								    matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
 								    doc.ents += tuple(matches)
 								    match1, match2 = matches
 								    assert match1[1] == 3
 								    assert match1[2] == 5
 								    assert match2[1] == 4
 								    assert match2[2] == 6
 								def test_issue309(en_tokenizer):
 								    """Test Issue #309: SBD fails on empty string"""
 								    tokens = en_tokenizer(" ")
 								    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=['ROOT'])
 								    doc.is_parsed = True
 								    assert len(doc) == 1
 								    sents = list(doc.sents)
 								    assert len(sents) == 1
 								def test_issue351(en_tokenizer):
 								    doc = en_tokenizer("   This is a cat.")
 								    assert doc[0].idx == 0
 								    assert len(doc[0]) == 3
 								    assert doc[1].idx == 3
 								def test_issue360(en_tokenizer):
 								    """Test tokenization of big ellipsis"""
 								    tokens = en_tokenizer('$45...............Asking')
 								    assert len(tokens) > 2
 								@pytest.mark.parametrize('text1,text2', [("cat", "dog")])
 								def test_issue361(en_vocab, text1, text2):
 								    """Test Issue #361: Equality of lexemes"""
 								    assert en_vocab[text1] == en_vocab[text1]
 								    assert en_vocab[text1] != en_vocab[text2]
 								def test_issue587(en_tokenizer):
 								    """Test that Matcher doesn't segfault on particular input"""
 								    doc = en_tokenizer('a b; c')
 								    matcher = Matcher(doc.vocab)
 								    matcher.add('TEST1', None, [{ORTH: 'a'}, {ORTH: 'b'}])
 								    matches = matcher(doc)
 								    assert len(matches) == 1
 								    matcher.add('TEST2', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}])
 								    matches = matcher(doc)
 								    assert len(matches) == 2
 								    matcher.add('TEST3', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}])
 								    matches = matcher(doc)
 								    assert len(matches) == 2
 								def test_issue588(en_vocab):
 								    matcher = Matcher(en_vocab)
 								    with pytest.raises(ValueError):
 								        matcher.add('TEST', None, [])
 								@pytest.mark.xfail
 								def test_issue589():
 								    vocab = Vocab()
 								    vocab.strings.set_frozen(True)
 								    doc = Doc(vocab, words=['whata'])
 								def test_issue590(en_vocab):
 								    """Test overlapping matches"""
 								    doc = Doc(en_vocab, words=['n', '=', '1', ';', 'a', ':', '5', '%'])
 								    matcher = Matcher(en_vocab)
 								    matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': ':'}, {'LIKE_NUM': True}, {'ORTH': '%'}])
 								    matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': '='}, {'LIKE_NUM': True}])
 								    matches = matcher(doc)
 								    assert len(matches) == 2
 								def test_issue595():
 								    """Test lemmatization of base forms"""
 								    words = ["Do", "n't", "feed", "the", "dog"]
 								    tag_map = {'VB': {POS: VERB, VerbForm_inf: True}}
 								    rules = {"verb": [["ed", "e"]]}
 								    lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)
 								    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
 								    doc = Doc(vocab, words=words)
 								    doc[2].tag_ = 'VB'
 								    assert doc[2].text == 'feed'
 								    assert doc[2].lemma_ == 'feed'
 								def test_issue599(en_vocab):
 								    doc = Doc(en_vocab)
 								    doc.is_tagged = True
 								    doc.is_parsed = True
 								    doc2 = Doc(doc.vocab)
 								    doc2.from_bytes(doc.to_bytes())
 								    assert doc2.is_parsed
 								def test_issue600():
 								    vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})
 								    doc = Doc(vocab, words=["hello"])
 								    doc[0].tag_ = 'NN'
 								def test_issue615(en_tokenizer):
 								    def merge_phrases(matcher, doc, i, matches):
 								        """Merge a phrase. We have to be careful here because we'll change the
 								        token indices. To avoid problems, merge all the phrases once we're called
 								        on the last match."""
 								        if i != len(matches)-1:
 								            return None
 								        spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
 								        for ent_id, label, span in spans:
 								            span.merge(tag='NNP' if label else span.root.tag_, lemma=span.text,
 								                label=label)
 								            doc.ents = doc.ents + ((label, span.start, span.end),)
 								    text = "The golf club is broken"
 								    pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
 								    label = "Sport_Equipment"
 								    doc = en_tokenizer(text)
 								    matcher = Matcher(doc.vocab)
 								    matcher.add(label, merge_phrases, pattern)
 								    match = matcher(doc)
 								    entities = list(doc.ents)
 								    assert entities != []
 								    assert entities[0].label != 0
 								@pytest.mark.parametrize('text,number', [("7am", "7"), ("11p.m.", "11")])
 								def test_issue736(en_tokenizer, text, number):
 								    """Test that times like "7am" are tokenized correctly and that numbers are
 								    converted to string."""
 								    tokens = en_tokenizer(text)
 								    assert len(tokens) == 2
 								    assert tokens[0].text == number
 								@pytest.mark.parametrize('text', ["3/4/2012", "01/12/1900"])
 								def test_issue740(en_tokenizer, text):
 								    """Test that dates are not split and kept as one token. This behaviour is
 								    currently inconsistent, since dates separated by hyphens are still split.
 								    This will be hard to prevent without causing clashes with numeric ranges."""
 								    tokens = en_tokenizer(text)
 								    assert len(tokens) == 1
 								def test_issue743():
 								    doc = Doc(Vocab(), ['hello', 'world'])
 								    token = doc[0]
 								    s = set([token])
 								    items = list(s)
 								    assert items[0] is token
 								@pytest.mark.parametrize('text', ["We were scared", "We Were Scared"])
 								def test_issue744(en_tokenizer, text):
 								    """Test that 'were' and 'Were' are excluded from the contractions
 								    generated by the English tokenizer exceptions."""
 								    tokens = en_tokenizer(text)
 								    assert len(tokens) == 3
 								    assert tokens[1].text.lower() == "were"
 								@pytest.mark.parametrize('text,is_num', [("one", True), ("ten", True),
 								                                         ("teneleven", False)])
 								def test_issue759(en_tokenizer, text, is_num):
 								    tokens = en_tokenizer(text)
 								    assert tokens[0].like_num == is_num
 								@pytest.mark.parametrize('text', ["Shell", "shell", "Shed", "shed"])
 								def test_issue775(en_tokenizer, text):
 								    """Test that 'Shell' and 'shell' are excluded from the contractions
 								    generated by the English tokenizer exceptions."""
 								    tokens = en_tokenizer(text)
 								    assert len(tokens) == 1
 								    assert tokens[0].text == text
 								@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
 								def test_issue792(en_tokenizer, text):
 								    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
 								    doc = en_tokenizer(text)
 								    assert ''.join([token.text_with_ws for token in doc]) == text
 								@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
 								def test_control_issue792(en_tokenizer, text):
 								    """Test base case for Issue #792: Non-trailing whitespace"""
 								    doc = en_tokenizer(text)
 								    assert ''.join([token.text_with_ws for token in doc]) == text
 								@pytest.mark.parametrize('text,tokens', [
 								    ('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
 								    ("exception;--exclusive", ["exception", ";--", "exclusive"]),
 								    ("day.--Is", ["day", ".--", "Is"]),
 								    ("refinement:--just", ["refinement", ":--", "just"]),
 								    ("memories?--To", ["memories", "?--", "To"]),
 								    ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
 								    ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"])])
 								def test_issue801(en_tokenizer, text, tokens):
 								    """Test that special characters + hyphens are split correctly."""
 								    doc = en_tokenizer(text)
 								    assert len(doc) == len(tokens)
 								    assert [t.text for t in doc] == tokens
 								@pytest.mark.parametrize('text,expected_tokens', [
 								    ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
 								    ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
 								])
 								def test_issue805(sv_tokenizer, text, expected_tokens):
 								    tokens = sv_tokenizer(text)
 								    token_list = [token.text for token in tokens if not token.is_space]
 								    assert expected_tokens == token_list
 								def test_issue850():
 								    """The variable-length pattern matches the succeeding token. Check we
 								    handle the ambiguity correctly."""
 								    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
 								    matcher = Matcher(vocab)
 								    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
 								    pattern = [{'LOWER': "bob"}, {'OP': '*', 'IS_ANY_TOKEN': True}, {'LOWER': 'frank'}]
 								    matcher.add('FarAway', None, pattern)
 								    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
 								    match = matcher(doc)
 								    assert len(match) == 1
 								    ent_id, start, end = match[0]
 								    assert start == 0
 								    assert end == 4
 								def test_issue850_basic():
 								    """Test Matcher matches with '*' operator and Boolean flag"""
 								    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
 								    matcher = Matcher(vocab)
 								    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
 								    pattern = [{'LOWER': "bob"}, {'OP': '*', 'LOWER': 'and'}, {'LOWER': 'frank'}]
 								    matcher.add('FarAway', None, pattern)
 								    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
 								    match = matcher(doc)
 								    assert len(match) == 1
 								    ent_id, start, end = match[0]
 								    assert start == 0
 								    assert end == 4
 								@pytest.mark.parametrize('text', ["au-delàs", "pair-programmâmes",
 								                                  "terra-formées", "σ-compacts"])
 								def test_issue852(fr_tokenizer, text):
 								    """Test that French tokenizer exceptions are imported correctly."""
 								    tokens = fr_tokenizer(text)
 								    assert len(tokens) == 1
 								@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
 								                                  "aaabbb@ccc.com \nThank you!"])
 								def test_issue859(en_tokenizer, text):
 								    """Test that no extra space is added in doc.text method."""
 								    doc = en_tokenizer(text)
 								    assert doc.text == text
 								@pytest.mark.parametrize('text', ["Datum:2014-06-02\nDokument:76467"])
 								def test_issue886(en_tokenizer, text):
 								    """Test that token.idx matches the original text index for texts with newlines."""
 								    doc = en_tokenizer(text)
 								    for token in doc:
 								        assert len(token.text) == len(token.text_with_ws)
 								        assert text[token.idx] == token.text[0]
 								@pytest.mark.parametrize('text', ["want/need"])
 								def test_issue891(en_tokenizer, text):
 								    """Test that / infixes are split correctly."""
 								    tokens = en_tokenizer(text)
 								    assert len(tokens) == 3
 								    assert tokens[1].text == "/"
 								@pytest.mark.parametrize('text,tag,lemma', [
 								    ("anus", "NN", "anus"),
 								    ("princess", "NN", "princess"),
 								    ("inner", "JJ", "inner")
 								])
 								def test_issue912(en_vocab, text, tag, lemma):
 								    """Test base-forms are preserved."""
 								    doc = Doc(en_vocab, words=[text])
 								    doc[0].tag_ = tag
 								    assert doc[0].lemma_ == lemma
 								def test_issue957(en_tokenizer):
 								    """Test that spaCy doesn't hang on many periods."""
 								    # skip test if pytest-timeout is not installed
 								    timeout = pytest.importorskip('pytest-timeout')
 								    string = '0'
 								    for i in range(1, 100):
 								        string += '.%d' % i
 								    doc = en_tokenizer(string)
 								@pytest.mark.xfail
 								def test_issue999(train_data):
 								    """Test that adding entities and resuming training works passably OK.
 								    There are two issues here:
 								    1) We have to readd labels. This isn't very nice.
 								    2) There's no way to set the learning rate for the weight update, so we
 								        end up out-of-scale, causing it to learn too fast.
 								    """
 								    TRAIN_DATA = [
 								        ["hey", []],
 								        ["howdy", []],
 								        ["hey there", []],
 								        ["hello", []],
 								        ["hi", []],
 								        ["i'm looking for a place to eat", []],
 								        ["i'm looking for a place in the north of town", [[31,36,"LOCATION"]]],
 								        ["show me chinese restaurants", [[8,15,"CUISINE"]]],
 								        ["show me chines restaurants", [[8,14,"CUISINE"]]],
 								    ]
 								    nlp = Language()
 								    ner = nlp.create_pipe('ner')
 								    nlp.add_pipe(ner)
 								    for _, offsets in TRAIN_DATA:
 								        for start, end, label in offsets:
 								            ner.add_label(label)
 								    nlp.begin_training()
 								    ner.model.learn_rate = 0.001
 								    for itn in range(100):
 								        random.shuffle(TRAIN_DATA)
 								        for raw_text, entity_offsets in TRAIN_DATA:
 								            nlp.update([raw_text], [{'entities': entity_offsets}])
 								    with make_tempdir() as model_dir:
 								        nlp.to_disk(model_dir)
 								        nlp2 = Language().from_disk(model_dir)
 								    for raw_text, entity_offsets in TRAIN_DATA:
 								        doc = nlp2(raw_text)
 								        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
 								        for start, end, label in entity_offsets:
 								            if (start, end) in ents:
 								                assert ents[(start, end)] == label
 								                break
 								        else:
 								            if entity_offsets:
 								                raise Exception(ents)