spaCy/spacy/tests/regression/test_issue3001-3500.py

import pytest
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.pipeline import EntityRuler, EntityRecognizer
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.attrs import ENT_IOB, ENT_TYPE
from spacy.compat import pickle
from spacy import displacy
from spacy.util import decaying
import numpy

from spacy.vectors import Vectors
from ..util import get_doc


def test_issue3002():
    """Regression test for #3002: the German tokenizer must not hang on a long
    run of dot-separated digit groups and must split the text into 5 tokens."""
    text = "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
    german = German()
    tokens = german(text)
    assert len(tokens) == 5


def test_issue3009(en_vocab):
    """Regression test for #3009: matcher patterns that use quantifiers.

    Three variants of the "have ... to do <IN>" pattern (no wildcard, a "*"
    wildcard and a "?" wildcard) are registered one after another on a single
    Matcher; the doc must produce matches after every registration.
    """

    def build_pattern(op=None):
        # Optionally place a quantified ASCII/non-punct wildcard after "have".
        tokens = [{"LEMMA": "have"}]
        if op is not None:
            tokens.append({"IS_ASCII": True, "IS_PUNCT": False, "OP": op})
        tokens.extend([{"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}])
        return tokens

    doc = get_doc(
        en_vocab,
        words=["also", "has", "to", "do", "with"],
        tags=["RB", "VBZ", "TO", "VB", "IN"],
    )
    matcher = Matcher(en_vocab)
    # NOTE: rules accumulate on the same matcher, so from the second iteration
    # on the result also contains matches of the rules added earlier.
    for rule_index, operator in enumerate((None, "*", "?")):
        matcher.add(str(rule_index), [build_pattern(operator)])
        assert matcher(doc)


def test_issue3012(en_vocab):
    """Regression test for #3012: tag information must survive a from_array
    call that only carries entity columns, and a bytes round trip."""

    def summarize(token):
        # The four fields every stage of the test compares against `expected`.
        return (token.text, token.pos_, token.tag_, token.ent_type_)

    doc = get_doc(
        en_vocab,
        words=["This", "is", "10", "%", "."],
        tags=["DT", "VBZ", "CD", "NN", "."],
        pos=["DET", "VERB", "NUM", "NOUN", "PUNCT"],
        ents=[(2, 4, "PERCENT")],
    )
    assert doc.is_tagged

    expected = ("10", "NUM", "CD", "PERCENT")
    assert summarize(doc[2]) == expected

    # Export only the entity attributes and load them straight back in.
    entity_attrs = [ENT_IOB, ENT_TYPE]
    entity_values = doc.to_array(entity_attrs)
    doc.from_array(entity_attrs, entity_values)
    assert summarize(doc[2]) == expected

    # Serialize to bytes and restore into a fresh Doc.
    payload = doc.to_bytes()
    restored = Doc(en_vocab).from_bytes(payload)
    assert summarize(restored[2]) == expected


def test_issue3199():
    """Regression test for #3199: Span.noun_chunks must work when no noun
    chunks iterator is available.

    To stay future-proof the Doc is built on a brand-new Vocab and is_parsed
    is set by hand so that the noun chunks code actually runs.
    """
    blank_vocab = Vocab()
    doc = Doc(blank_vocab, words=["This", "is", "a", "sentence"])
    doc.is_parsed = True
    span = doc[0:3]
    chunks = list(span.noun_chunks)
    assert chunks == []


def test_issue3209():
    """Regression test for #3209 (seen in a spaCy nightly): NER labels added
    through ner.add_label() were mapped to the wrong classes after the model
    was loaded again.
    """
    expected_moves = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]

    # Source pipeline: add the label, then initialize the model.
    source_nlp = English()
    source_ner = source_nlp.create_pipe("ner")
    source_nlp.add_pipe(source_ner)
    source_ner.add_label("ANIMAL")
    source_nlp.begin_training()
    assert source_ner.move_names == expected_moves

    # Target pipeline: restore from the serialized source and compare moves.
    target_nlp = English()
    target_ner = target_nlp.create_pipe("ner")
    target_nlp.add_pipe(target_ner)
    serialized = source_nlp.to_bytes()
    target_nlp.from_bytes(serialized)
    assert target_nlp.get_pipe("ner").move_names == expected_moves


def test_issue3248_1():
    """Regression test for #3248: len(PhraseMatcher) is the number of rules,
    not the total number of patterns across those rules."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    rules = [("TEST1", ["a", "b", "c"]), ("TEST2", ["d"])]
    for rule_name, phrases in rules:
        matcher.add(rule_name, [nlp(phrase) for phrase in phrases])
    # Four patterns were added in total, but only two rule names.
    assert len(matcher) == 2


def test_issue3248_2():
    """Regression test for #3248: a PhraseMatcher survives a pickle round
    trip and reports the same length afterwards."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    rules = [("TEST1", ["a", "b", "c"]), ("TEST2", ["d"])]
    for rule_name, phrases in rules:
        matcher.add(rule_name, [nlp(phrase) for phrase in phrases])
    restored = pickle.loads(pickle.dumps(matcher))
    assert len(restored) == len(matcher)


def test_issue3277(es_tokenizer):
    """Regression test for #3277: the Spanish tokenizer splits em and en
    dashes attached to words into tokens of their own."""
    text = "—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez."
    doc = es_tokenizer(text)
    assert len(doc) == 14
    # Token index -> dash character expected at that position.
    expected_dashes = ((0, "\u2014"), (5, "\u2013"), (9, "\u2013"))
    for position, dash in expected_dashes:
        assert doc[position].text == dash


def test_issue3288(en_vocab):
    """Regression test for #3288: retokenization through displaCy works when
    punctuation is merged onto the preceding token and the tensor has to be
    resized."""
    # One row per token: (word, relative head offset, dependency label).
    rows = [
        ("Hello", 1, "intj"),
        ("World", 0, "ROOT"),
        ("!", -1, "punct"),
        ("When", 1, "advmod"),
        ("is", 0, "ROOT"),
        ("this", 1, "det"),
        ("breaking", -2, "nsubj"),
        ("?", -3, "punct"),
    ]
    words, heads, deps = (list(column) for column in zip(*rows))
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    # Attach a tensor with one 96-wide row per token.
    tensor_shape = (len(words), 96)
    doc.tensor = numpy.zeros(tensor_shape, dtype="float32")
    displacy.render(doc)


def test_issue3289():
    """Test that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model, and that the data can be loaded into a second
    pipeline that has the same component."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    # No begin_training() call here, so the textcat model stays uninitialized.
    bytes_data = nlp.to_bytes()
    new_nlp = English()
    # Build the component from the receiving pipeline itself (not from `nlp`),
    # so the two pipelines share no objects before deserialization.
    new_nlp.add_pipe(new_nlp.create_pipe("textcat"))
    new_nlp.from_bytes(bytes_data)


def test_issue3328(en_vocab):
    """Regression test for #3328: a rule with two single-token patterns that
    use the "IN" set predicate on LOWER yields one match per listed word, in
    document order."""
    words = ["Hello", ",", "how", "are", "you", "doing", "?"]
    doc = Doc(en_vocab, words=words)
    matcher = Matcher(en_vocab)
    first_pattern = [{"LOWER": {"IN": ["hello", "how"]}}]
    second_pattern = [{"LOWER": {"IN": ["you", "doing"]}}]
    matcher.add("TEST", [first_pattern, second_pattern])
    matches = matcher(doc)
    assert len(matches) == 4
    matched_texts = [doc[begin:stop].text for _match_id, begin, stop in matches]
    assert matched_texts == ["Hello", "how", "you", "doing"]


def test_issue3331(en_vocab):
    """Regression test for #3331: the same phrase registered under two
    different rules produces two matches, one for each rule.
    """
    matcher = PhraseMatcher(en_vocab)
    for rule_name in ("A", "B"):
        # Each rule gets its own, identical pattern Doc.
        matcher.add(rule_name, [Doc(en_vocab, words=["Barack", "Obama"])])
    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
    matches = matcher(doc)
    assert len(matches) == 2
    # Resolve the match ID hashes back to the rule names.
    match_ids = [en_vocab.strings[match[0]] for match in matches]
    assert sorted(match_ids) == ["A", "B"]


def test_issue3345():
    """Regression test for #3345: a preset entity that crosses a sentence
    boundary must still be a valid move for the entity recognizer."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    # Put a sentence start on "York", i.e. in the middle of "New York".
    doc[4].is_sent_start = True
    gpe_patterns = [{"label": "GPE", "pattern": "New York"}]
    ruler = EntityRuler(nlp, patterns=gpe_patterns)
    ner = EntityRecognizer(doc.vocab)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Advance past "I", "live" and "in" so the state sits just before "New".
    states = ner.moves.init_batch([doc])
    state = states[0]
    for _ in range(3):
        ner.moves.apply_transition(state, "O")
    # Beginning a GPE entity must be allowed at this point.
    assert ner.moves.is_valid(state, "B-GPE")


def test_issue3410():
    """Regression test for #3410: passing n_threads to the pipe() methods of
    Language, the tokenizer, Matcher and PhraseMatcher triggers a deprecation
    warning in each case."""
    nlp = English()
    texts = ["Hello world", "This is a test"]
    matcher = Matcher(nlp.vocab)
    phrasematcher = PhraseMatcher(nlp.vocab)
    # Full pipeline; the result is consumed only to drive the generator.
    with pytest.deprecated_call():
        list(nlp.pipe(texts, n_threads=4))
    # Tokenizer only; these docs feed the two matcher calls below.
    with pytest.deprecated_call():
        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        list(matcher.pipe(docs, n_threads=4))
    with pytest.deprecated_call():
        list(phrasematcher.pipe(docs, n_threads=4))


def test_issue3412():
    """Regression test for #3412: Vectors.most_similar copes with all-zero
    rows in both the table and the queries, and still picks the identical row
    for a non-zero query."""
    table = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
    vectors = Vectors(data=table)
    queries = numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
    _keys, best_rows, _scores = vectors.most_similar(queries)
    # The first query equals table row 2.
    assert best_rows[0] == 2


def test_issue3447():
    """Regression test for #3447: the first three values of
    decaying(10.0, 1.0, 0.5) start at 10.0 and drop by 0.5 per step."""
    sizes = decaying(10.0, 1.0, 0.5)
    expected_sizes = (10.0, 10.0 - 0.5, 10.0 - 0.5 - 0.5)
    for expected in expected_sizes:
        assert next(sizes) == expected


@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
def test_issue3449():
    """Regression test for #3449 (expected to fail): a sentence-final "I"
    should be its own token regardless of the whitespace after the period."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    # Same two sentences, separated by one space, two spaces and a newline.
    texts = [
        "He gave the ball to I. Do you want to go to the movies with I?",
        "He gave the ball to I.  Do you want to go to the movies with I?",
        "He gave the ball to I.\nDo you want to go to the movies with I?",
    ]
    # Process every text before checking any of them.
    docs = [nlp(text) for text in texts]
    for doc in docs:
        assert doc[5].text == "I"


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3456():
    """Regression test for #3456: piping a batch that contains an empty text
    through a freshly initialized tagger must not crash."""
    # this crashed because of a padding error in layer.ops.unflatten in thinc
    nlp = English()
    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)
    nlp.begin_training()
    texts = ["hi", ""]
    # Exhaust the generator so every text is actually processed.
    for _doc in nlp.pipe(texts):
        pass


def test_issue3468():
    """Regression test for #3468: sentence boundaries are set in a way that
    lets Doc.is_sentenced be restored after serialization."""

    def assert_single_sentence(candidate):
        # First token starts a sentence, the doc is flagged as sentenced and
        # iterating the sentences yields exactly one.
        assert candidate[0].is_sent_start
        assert candidate.is_sentenced
        assert len(list(candidate.sents)) == 1

    nlp = English()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp("Hello world")
    assert_single_sentence(doc)

    payload = doc.to_bytes()
    restored = Doc(nlp.vocab).from_bytes(payload)
    assert_single_sentence(restored)
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								import pytest
 								from spacy.lang.en import English
 								from spacy.lang.de import German
 								from spacy.pipeline import EntityRuler, EntityRecognizer
 								from spacy.matcher import Matcher, PhraseMatcher
 								from spacy.tokens import Doc
 								from spacy.vocab import Vocab
 								from spacy.attrs import ENT_IOB, ENT_TYPE
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 00:53:56 +00:00
+								from spacy.compat import pickle
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								from spacy import displacy
 								from spacy.util import decaying
 								import numpy
-												prevent division by zero in most_similar method (#4488)


											
										
										
											2019-10-21 10:04:46 +00:00
+								from spacy.vectors import Vectors
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								from ..util import get_doc
 								def test_issue3002():
 								    """Test that the tokenizer doesn't hang on a long list of dots"""
 								    nlp = German()
 								    doc = nlp(
 								        "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
 								    )
 								    assert len(doc) == 5
 								def test_issue3009(en_vocab):
 								    """Test problem with matcher quantifiers"""
 								    patterns = [
-												Make regression test less sensitive to tag-map stuff

											
										
										
											2019-08-25 19:54:26 +00:00
+								        [{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}],
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								        [
 								            {"LEMMA": "have"},
 								            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
 								            {"LOWER": "to"},
 								            {"LOWER": "do"},
-												Make regression test less sensitive to tag-map stuff

											
										
										
											2019-08-25 19:54:26 +00:00
+								            {"TAG": "IN"},
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								        ],
 								        [
 								            {"LEMMA": "have"},
 								            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
 								            {"LOWER": "to"},
 								            {"LOWER": "do"},
-												Make regression test less sensitive to tag-map stuff

											
										
										
											2019-08-25 19:54:26 +00:00
+								            {"TAG": "IN"},
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								        ],
 								    ]
 								    words = ["also", "has", "to", "do", "with"]
 								    tags = ["RB", "VBZ", "TO", "VB", "IN"]
 								    doc = get_doc(en_vocab, words=words, tags=tags)
 								    matcher = Matcher(en_vocab)
 								    for i, pattern in enumerate(patterns):
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 20:21:08 +00:00
+								        matcher.add(str(i), [pattern])
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								        matches = matcher(doc)
 								        assert matches
 								def test_issue3012(en_vocab):
 								    """Test that the is_tagged attribute doesn't get overwritten when we from_array
 								    without tag information."""
 								    words = ["This", "is", "10", "%", "."]
 								    tags = ["DT", "VBZ", "CD", "NN", "."]
 								    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
 								    ents = [(2, 4, "PERCENT")]
 								    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
 								    assert doc.is_tagged
 								    expected = ("10", "NUM", "CD", "PERCENT")
 								    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
 								    header = [ENT_IOB, ENT_TYPE]
 								    ent_array = doc.to_array(header)
 								    doc.from_array(header, ent_array)
 								    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
 								    # Serializing then deserializing
 								    doc_bytes = doc.to_bytes()
 								    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
 								    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected
 								def test_issue3199():
 								    """Test that Span.noun_chunks works correctly if no noun chunks iterator
 								    is available. To make this test future-proof, we're constructing a Doc
 								    with a new Vocab here and setting is_parsed to make sure the noun chunks run.
 								    """
 								    doc = Doc(Vocab(), words=["This", "is", "a", "sentence"])
 								    doc.is_parsed = True
 								    assert list(doc[0:3].noun_chunks) == []
 								def test_issue3209():
 								    """Test issue that occurred in spaCy nightly where NER labels were being
 								    mapped to classes incorrectly after loading the model, when the labels
 								    were added using ner.add_label().
 								    """
 								    nlp = English()
 								    ner = nlp.create_pipe("ner")
 								    nlp.add_pipe(ner)
 								    ner.add_label("ANIMAL")
 								    nlp.begin_training()
 								    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
 								    assert ner.move_names == move_names
 								    nlp2 = English()
 								    nlp2.add_pipe(nlp2.create_pipe("ner"))
 								    nlp2.from_bytes(nlp.to_bytes())
 								    assert nlp2.get_pipe("ner").move_names == move_names
 								def test_issue3248_1():
 								    """Test that the PhraseMatcher correctly reports its number of rules, not
 								    total number of patterns."""
 								    nlp = English()
 								    matcher = PhraseMatcher(nlp.vocab)
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 20:21:08 +00:00
+								    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
 								    matcher.add("TEST2", [nlp("d")])
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								    assert len(matcher) == 2
 								def test_issue3248_2():
 								    """Test that the PhraseMatcher can be pickled correctly."""
 								    nlp = English()
 								    matcher = PhraseMatcher(nlp.vocab)
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 20:21:08 +00:00
+								    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
 								    matcher.add("TEST2", [nlp("d")])
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								    data = pickle.dumps(matcher)
 								    new_matcher = pickle.loads(data)
 								    assert len(new_matcher) == len(matcher)
 								def test_issue3277(es_tokenizer):
 								    """Test that hyphens are split correctly as prefixes."""
 								    doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
 								    assert len(doc) == 14
 								    assert doc[0].text == "\u2014"
 								    assert doc[5].text == "\u2013"
 								    assert doc[9].text == "\u2013"
 								def test_issue3288(en_vocab):
 								    """Test that retokenization works correctly via displaCy when punctuation
 								    is merged onto the preceeding token and tensor is resized."""
 								    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
 								    heads = [1, 0, -1, 1, 0, 1, -2, -3]
 								    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
 								    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
 								    doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
 								    displacy.render(doc)
 								def test_issue3289():
 								    """Test that Language.to_bytes handles serializing a pipeline component
 								    with an uninitialized model."""
 								    nlp = English()
 								    nlp.add_pipe(nlp.create_pipe("textcat"))
 								    bytes_data = nlp.to_bytes()
 								    new_nlp = English()
 								    new_nlp.add_pipe(nlp.create_pipe("textcat"))
 								    new_nlp.from_bytes(bytes_data)
 								def test_issue3328(en_vocab):
 								    doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
 								    matcher = Matcher(en_vocab)
 								    patterns = [
 								        [{"LOWER": {"IN": ["hello", "how"]}}],
 								        [{"LOWER": {"IN": ["you", "doing"]}}],
 								    ]
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 20:21:08 +00:00
+								    matcher.add("TEST", patterns)
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								    matches = matcher(doc)
 								    assert len(matches) == 4
 								    matched_texts = [doc[start:end].text for _, start, end in matches]
 								    assert matched_texts == ["Hello", "how", "you", "doing"]
 								def test_issue3331(en_vocab):
 								    """Test that duplicate patterns for different rules result in multiple
 								    matches, one per rule.
 								    """
 								    matcher = PhraseMatcher(en_vocab)
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 20:21:08 +00:00
+								    matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
 								    matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
 								    matches = matcher(doc)
 								    assert len(matches) == 2
 								    match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
 								    assert sorted(match_ids) == ["A", "B"]
 								def test_issue3345():
 								    """Test case where preset entity crosses sentence boundary."""
 								    nlp = English()
 								    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
 								    doc[4].is_sent_start = True
 								    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
 								    ner = EntityRecognizer(doc.vocab)
 								    # Add the OUT action. I wouldn't have thought this would be necessary...
 								    ner.moves.add_action(5, "")
 								    ner.add_label("GPE")
 								    doc = ruler(doc)
 								    # Get into the state just before "New"
 								    state = ner.moves.init_batch([doc])[0]
 								    ner.moves.apply_transition(state, "O")
 								    ner.moves.apply_transition(state, "O")
 								    ner.moves.apply_transition(state, "O")
 								    # Check that B-GPE is valid.
 								    assert ner.moves.is_valid(state, "B-GPE")
 								def test_issue3410():
 								    texts = ["Hello world", "This is a test"]
 								    nlp = English()
 								    matcher = Matcher(nlp.vocab)
 								    phrasematcher = PhraseMatcher(nlp.vocab)
 								    with pytest.deprecated_call():
 								        docs = list(nlp.pipe(texts, n_threads=4))
 								    with pytest.deprecated_call():
 								        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
 								    with pytest.deprecated_call():
 								        list(matcher.pipe(docs, n_threads=4))
 								    with pytest.deprecated_call():
 								        list(phrasematcher.pipe(docs, n_threads=4))
-												prevent division by zero in most_similar method (#4488)


											
										
										
											2019-10-21 10:04:46 +00:00
+								def test_issue3412():
 								    data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
 								    vectors = Vectors(data=data)
-												Auto-format [ci skip]

											
										
										
											2019-10-24 14:21:08 +00:00
+								    keys, best_rows, scores = vectors.most_similar(
 								        numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
 								    )
 								    assert best_rows[0] == 2
-												prevent division by zero in most_similar method (#4488)


											
										
										
											2019-10-21 10:04:46 +00:00
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								def test_issue3447():
 								    sizes = decaying(10.0, 1.0, 0.5)
 								    size = next(sizes)
 								    assert size == 10.0
 								    size = next(sizes)
 								    assert size == 10.0 - 0.5
 								    size = next(sizes)
 								    assert size == 10.0 - 0.5 - 0.5
 								@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
 								def test_issue3449():
 								    nlp = English()
 								    nlp.add_pipe(nlp.create_pipe("sentencizer"))
 								    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
 								    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
 								    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
 								    t1 = nlp(text1)
 								    t2 = nlp(text2)
 								    t3 = nlp(text3)
 								    assert t1[5].text == "I"
 								    assert t2[5].text == "I"
 								    assert t3[5].text == "I"
-												Fix xpassing tests (#4657)

* Ignore internal warnings

* Un-xfail passing tests

* Skip instead of xfail

											
										
										
											2019-11-16 19:20:53 +00:00
+								@pytest.mark.filterwarnings("ignore::UserWarning")
-												Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent batch of empty docs to cause problems

											
										
										
											2019-10-02 10:50:48 +00:00
+								def test_issue3456():
 								    # this crashed because of a padding error in layer.ops.unflatten in thinc
 								    nlp = English()
 								    nlp.add_pipe(nlp.create_pipe("tagger"))
 								    nlp.begin_training()
-												Tidy up and auto-format

											
										
										
											2019-10-18 09:27:38 +00:00
+								    list(nlp.pipe(["hi", ""]))
-												Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent batch of empty docs to cause problems

											
										
										
											2019-10-02 10:50:48 +00:00
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								def test_issue3468():
 								    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
 								    be restored after serialization."""
 								    nlp = English()
 								    nlp.add_pipe(nlp.create_pipe("sentencizer"))
 								    doc = nlp("Hello world")
 								    assert doc[0].is_sent_start
 								    assert doc.is_sentenced
 								    assert len(list(doc.sents)) == 1
 								    doc_bytes = doc.to_bytes()
 								    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
 								    assert new_doc[0].is_sent_start
 								    assert new_doc.is_sentenced
 								    assert len(list(new_doc.sents)) == 1