spaCy/spacy/tests/regression/test_issue3001-3500.py

import pytest
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.ml.models.defaults import default_ner
from spacy.pipeline import EntityRuler, EntityRecognizer
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.attrs import ENT_IOB, ENT_TYPE
from spacy.compat import pickle
from spacy import displacy
from spacy.util import decaying
import numpy

from spacy.vectors import Vectors
from ..util import get_doc


def test_issue3002():
    """Regression test for #3002: the German tokenizer must not hang on a
    long run of dot-separated digit groups."""
    text = "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
    tokens = German()(text)
    # The number stays one token, followed by the four remaining words.
    assert len(tokens) == 5


def test_issue3009(en_vocab):
    """Test problem with matcher quantifiers.

    Every pattern is checked with its own Matcher. With a single shared
    Matcher, the match produced by the first (quantifier-free) pattern would
    also satisfy the assertion for the ``*`` and ``?`` patterns, so those
    would never actually be exercised.
    """
    patterns = [
        [{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}],
        [
            {"LEMMA": "have"},
            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
            {"LOWER": "to"},
            {"LOWER": "do"},
            {"TAG": "IN"},
        ],
        [
            {"LEMMA": "have"},
            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
            {"LOWER": "to"},
            {"LOWER": "do"},
            {"TAG": "IN"},
        ],
    ]
    words = ["also", "has", "to", "do", "with"]
    tags = ["RB", "VBZ", "TO", "VB", "IN"]
    doc = get_doc(en_vocab, words=words, tags=tags)
    for i, pattern in enumerate(patterns):
        # Fresh matcher per pattern so each one has to match on its own.
        matcher = Matcher(en_vocab)
        matcher.add(str(i), [pattern])
        matches = matcher(doc)
        assert matches, "pattern {} produced no matches".format(i)


def test_issue3012(en_vocab):
    """Check that tag information (and ``is_tagged``) survives a
    ``from_array`` call that only carries entity columns, and also survives a
    bytes round trip."""

    def third_token_summary(some_doc):
        # Text, coarse POS, fine-grained tag and entity type of token 2.
        return (
            some_doc[2].text,
            some_doc[2].pos_,
            some_doc[2].tag_,
            some_doc[2].ent_type_,
        )

    doc = get_doc(
        en_vocab,
        words=["This", "is", "10", "%", "."],
        tags=["DT", "VBZ", "CD", "NN", "."],
        pos=["DET", "VERB", "NUM", "NOUN", "PUNCT"],
        ents=[(2, 4, "PERCENT")],
    )
    assert doc.is_tagged

    expected = ("10", "NUM", "CD", "PERCENT")
    assert third_token_summary(doc) == expected

    # Export only the entity columns and load them straight back in.
    entity_columns = [ENT_IOB, ENT_TYPE]
    doc.from_array(entity_columns, doc.to_array(entity_columns))
    assert third_token_summary(doc) == expected

    # Serialize to bytes and restore into a brand-new Doc.
    restored = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert third_token_summary(restored) == expected


def test_issue3199():
    """Check that ``Span.noun_chunks`` yields nothing (rather than failing)
    when no noun chunks iterator is available.

    A Doc built on a brand-new Vocab is used and ``is_parsed`` is set by hand
    so that the noun chunks code path actually runs.
    """
    words = ["This", "is", "a", "sentence"]
    doc = Doc(Vocab(), words=words)
    doc.is_parsed = True
    span = doc[0:3]
    chunks = list(span.noun_chunks)
    assert chunks == []


def test_issue3209():
    """Regression test for a spaCy nightly issue: NER labels added through
    ``ner.add_label()`` were mapped to the wrong classes once the model was
    loaded again.
    """
    expected_moves = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]

    # Source pipeline: NER with a single custom label.
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    ner.add_label("ANIMAL")
    nlp.begin_training()
    assert ner.move_names == expected_moves

    # Target pipeline: resize the output layer to the source's move count
    # before loading the serialized pipeline into it.
    nlp2 = English()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    target_model = nlp2.get_pipe("ner").model
    target_model.resize_output(ner.moves.n_moves)
    serialized = nlp.to_bytes()
    nlp2.from_bytes(serialized)
    loaded_moves = nlp2.get_pipe("ner").move_names
    assert loaded_moves == expected_moves


def test_issue3248_1():
    """Check that ``len(PhraseMatcher)`` counts rules, not the total number
    of patterns added across them."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    rules = (("TEST1", ("a", "b", "c")), ("TEST2", ("d",)))
    for key, texts in rules:
        matcher.add(key, [nlp(text) for text in texts])
    # Two rules, four patterns in total.
    assert len(matcher) == 2


def test_issue3248_2():
    """Check that a PhraseMatcher survives a pickle round trip with the same
    reported length."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    rules = (("TEST1", ("a", "b", "c")), ("TEST2", ("d",)))
    for key, texts in rules:
        matcher.add(key, [nlp(text) for text in texts])
    restored = pickle.loads(pickle.dumps(matcher))
    assert len(restored) == len(matcher)


def test_issue3277(es_tokenizer):
    """Check that em/en dashes are split off as their own tokens in Spanish
    text (hyphens handled as prefixes)."""
    doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
    assert len(doc) == 14
    # (token index, expected dash character)
    expected_dashes = ((0, "\u2014"), (5, "\u2013"), (9, "\u2013"))
    for index, dash in expected_dashes:
        assert doc[index].text == dash


def test_issue3288(en_vocab):
    """Test that retokenization works correctly via displaCy when punctuation
    is merged onto the preceding token and the tensor is resized."""
    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
    heads = [1, 0, -1, 1, 0, 1, -2, -3]
    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    # Attach a zero tensor with one row per token.
    tensor_shape = (len(words), 96)
    doc.tensor = numpy.zeros(tensor_shape, dtype="float32")
    # No assertion: the test passes as long as rendering does not raise.
    displacy.render(doc)


def test_issue3289():
    """Test that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model.

    No assertion is made; the test passes if serializing and loading back
    does not raise.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    bytes_data = nlp.to_bytes()
    new_nlp = English()
    # Build the receiving component from the receiving pipeline itself, so
    # it is not created through the source ``nlp`` object.
    new_nlp.add_pipe(new_nlp.create_pipe("textcat"))
    new_nlp.from_bytes(bytes_data)


def test_issue3328(en_vocab):
    """Check that a rule made of two single-token ``IN`` patterns reports one
    match per listed word, in document order."""
    words = ["Hello", ",", "how", "are", "you", "doing", "?"]
    doc = Doc(en_vocab, words=words)
    greeting_pattern = [{"LOWER": {"IN": ["hello", "how"]}}]
    activity_pattern = [{"LOWER": {"IN": ["you", "doing"]}}]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [greeting_pattern, activity_pattern])
    matches = matcher(doc)
    assert len(matches) == 4
    matched_texts = []
    for _match_id, start, end in matches:
        matched_texts.append(doc[start:end].text)
    assert matched_texts == ["Hello", "how", "you", "doing"]


def test_issue3331(en_vocab):
    """Check that the same phrase registered under two different rules
    produces two matches, one for each rule.
    """
    matcher = PhraseMatcher(en_vocab)
    for rule in ("A", "B"):
        matcher.add(rule, [Doc(en_vocab, words=["Barack", "Obama"])])
    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
    matches = matcher(doc)
    assert len(matches) == 2
    # Resolve the hashed match ids back to rule names; order is not asserted.
    first, second = matches[0], matches[1]
    match_ids = [en_vocab.strings[first[0]], en_vocab.strings[second[0]]]
    assert sorted(match_ids) == ["A", "B"]


def test_issue3345():
    """Check that a preset entity spanning a sentence boundary can still be
    opened by the NER transition system."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    # Put a sentence boundary in the middle of "New York".
    doc[4].is_sent_start = True
    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
    ner = EntityRecognizer(doc.vocab, default_ner())
    # Register the OUT action explicitly (the original author noted this was
    # unexpectedly required; 5 is presumably the OUT move id - confirm
    # against the transition system).
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Advance past "I", "live" and "in" so the state sits just before "New".
    state = ner.moves.init_batch([doc])[0]
    for _ in range(3):
        ner.moves.apply_transition(state, "O")
    # Opening a GPE entity must be allowed here.
    assert ner.moves.is_valid(state, "B-GPE")


def test_issue3410():
    """Check that passing ``n_threads`` to each ``pipe`` method triggers a
    deprecation warning (as detected by ``pytest.deprecated_call``)."""
    texts = ["Hello world", "This is a test"]
    nlp = English()

    # Language.pipe
    with pytest.deprecated_call():
        docs = list(nlp.pipe(texts, n_threads=4))

    # Tokenizer.pipe; its output feeds the matcher checks below.
    with pytest.deprecated_call():
        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))

    # Matcher.pipe
    token_matcher = Matcher(nlp.vocab)
    with pytest.deprecated_call():
        list(token_matcher.pipe(docs, n_threads=4))

    # PhraseMatcher.pipe
    phrase_matcher = PhraseMatcher(nlp.vocab)
    with pytest.deprecated_call():
        list(phrase_matcher.pipe(docs, n_threads=4))


def test_issue3412():
    """Query ``Vectors.most_similar`` with a batch that includes an all-zero
    vector and check the best row reported for the first query."""
    table = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
    queries = numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
    vectors = Vectors(data=table)
    _keys, best_rows, _scores = vectors.most_similar(queries)
    # The first query is identical to row 2 of the table.
    assert best_rows[0] == 2


def test_issue3447():
    """Check the first three values produced by ``decaying(10.0, 1.0, 0.5)``:
    the start value, then 0.5 less on each following step."""
    sizes = decaying(10.0, 1.0, 0.5)
    expected_values = [10.0, 10.0 - 0.5, 10.0 - 0.5 - 0.5]
    for expected in expected_values:
        assert next(sizes) == expected


@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
def test_issue3449():
    """Check that a sentence-final "I." is split into "I" and "." no matter
    which whitespace follows it (one space, two spaces, or a newline)."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    texts = [
        "He gave the ball to I. Do you want to go to the movies with I?",
        "He gave the ball to I.  Do you want to go to the movies with I?",
        "He gave the ball to I.\nDo you want to go to the movies with I?",
    ]
    # Process all variants first, then check the sixth token of each.
    docs = [nlp(text) for text in texts]
    for doc in docs:
        assert doc[5].text == "I"


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3456():
    """Check that piping a batch containing an empty text through a freshly
    initialized tagger does not crash."""
    # Previously crashed due to a padding error in layer.ops.unflatten in thinc.
    nlp = English()
    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)
    nlp.begin_training()
    batch = ["hi", ""]
    list(nlp.pipe(batch))


def test_issue3468():
    """Check that sentence boundaries are stored such that
    ``Doc.is_sentenced`` is still true after a bytes round trip."""

    def assert_single_sentence(some_doc):
        # First token starts a sentence, the doc is flagged as sentenced,
        # and exactly one sentence is produced.
        assert some_doc[0].is_sent_start
        assert some_doc.is_sentenced
        assert len(list(some_doc.sents)) == 1

    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("Hello world")
    assert_single_sentence(doc)

    restored = Doc(nlp.vocab).from_bytes(doc.to_bytes())
    assert_single_sentence(restored)
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								import pytest
 								from spacy.lang.en import English
 								from spacy.lang.de import German
-												Default settings to configurations (#4995)

* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build

											
										
										
											2020-02-27 17:42:27 +00:00
+								from spacy.ml.models.defaults import default_ner
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								from spacy.pipeline import EntityRuler, EntityRecognizer
 								from spacy.matcher import Matcher, PhraseMatcher
 								from spacy.tokens import Doc
 								from spacy.vocab import Vocab
 								from spacy.attrs import ENT_IOB, ENT_TYPE
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 00:53:56 +00:00
+								from spacy.compat import pickle
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								from spacy import displacy
 								from spacy.util import decaying
 								import numpy
-												prevent division by zero in most_similar method (#4488)


											
										
										
											2019-10-21 10:04:46 +00:00
+								from spacy.vectors import Vectors
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								from ..util import get_doc
 								def test_issue3002():
 								    """Test that the tokenizer doesn't hang on a long list of dots"""
 								    nlp = German()
 								    doc = nlp(
 								        "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
 								    )
 								    assert len(doc) == 5
 								def test_issue3009(en_vocab):
 								    """Test problem with matcher quantifiers"""
 								    patterns = [
-												Make regression test less sensitive to tag-map stuff

											
										
										
											2019-08-25 19:54:26 +00:00
+								        [{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}],
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								        [
 								            {"LEMMA": "have"},
 								            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
 								            {"LOWER": "to"},
 								            {"LOWER": "do"},
-												Make regression test less sensitive to tag-map stuff

											
										
										
											2019-08-25 19:54:26 +00:00
+								            {"TAG": "IN"},
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								        ],
 								        [
 								            {"LEMMA": "have"},
 								            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
 								            {"LOWER": "to"},
 								            {"LOWER": "do"},
-												Make regression test less sensitive to tag-map stuff

											
										
										
											2019-08-25 19:54:26 +00:00
+								            {"TAG": "IN"},
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								        ],
 								    ]
 								    words = ["also", "has", "to", "do", "with"]
 								    tags = ["RB", "VBZ", "TO", "VB", "IN"]
 								    doc = get_doc(en_vocab, words=words, tags=tags)
 								    matcher = Matcher(en_vocab)
 								    for i, pattern in enumerate(patterns):
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 20:21:08 +00:00
+								        matcher.add(str(i), [pattern])
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								        matches = matcher(doc)
 								        assert matches
 								def test_issue3012(en_vocab):
 								    """Test that the is_tagged attribute doesn't get overwritten when we from_array
 								    without tag information."""
 								    words = ["This", "is", "10", "%", "."]
 								    tags = ["DT", "VBZ", "CD", "NN", "."]
 								    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
 								    ents = [(2, 4, "PERCENT")]
 								    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
 								    assert doc.is_tagged
 								    expected = ("10", "NUM", "CD", "PERCENT")
 								    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
 								    header = [ENT_IOB, ENT_TYPE]
 								    ent_array = doc.to_array(header)
 								    doc.from_array(header, ent_array)
 								    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
 								    # Serializing then deserializing
 								    doc_bytes = doc.to_bytes()
 								    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
 								    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected
 								def test_issue3199():
 								    """Test that Span.noun_chunks works correctly if no noun chunks iterator
 								    is available. To make this test future-proof, we're constructing a Doc
 								    with a new Vocab here and setting is_parsed to make sure the noun chunks run.
 								    """
 								    doc = Doc(Vocab(), words=["This", "is", "a", "sentence"])
 								    doc.is_parsed = True
 								    assert list(doc[0:3].noun_chunks) == []
 								def test_issue3209():
 								    """Test issue that occurred in spaCy nightly where NER labels were being
 								    mapped to classes incorrectly after loading the model, when the labels
 								    were added using ner.add_label().
 								    """
 								    nlp = English()
 								    ner = nlp.create_pipe("ner")
 								    nlp.add_pipe(ner)
 								    ner.add_label("ANIMAL")
 								    nlp.begin_training()
 								    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
 								    assert ner.move_names == move_names
 								    nlp2 = English()
 								    nlp2.add_pipe(nlp2.create_pipe("ner"))
-												Default settings to configurations (#4995)

* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build

											
										
										
											2020-02-27 17:42:27 +00:00
+								    nlp2.get_pipe("ner").model.resize_output(ner.moves.n_moves)
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								    nlp2.from_bytes(nlp.to_bytes())
 								    assert nlp2.get_pipe("ner").move_names == move_names
 								def test_issue3248_1():
 								    """Test that the PhraseMatcher correctly reports its number of rules, not
 								    total number of patterns."""
 								    nlp = English()
 								    matcher = PhraseMatcher(nlp.vocab)
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 20:21:08 +00:00
+								    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
 								    matcher.add("TEST2", [nlp("d")])
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								    assert len(matcher) == 2
 								def test_issue3248_2():
 								    """Test that the PhraseMatcher can be pickled correctly."""
 								    nlp = English()
 								    matcher = PhraseMatcher(nlp.vocab)
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 20:21:08 +00:00
+								    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
 								    matcher.add("TEST2", [nlp("d")])
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								    data = pickle.dumps(matcher)
 								    new_matcher = pickle.loads(data)
 								    assert len(new_matcher) == len(matcher)
 								def test_issue3277(es_tokenizer):
 								    """Test that hyphens are split correctly as prefixes."""
 								    doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
 								    assert len(doc) == 14
 								    assert doc[0].text == "\u2014"
 								    assert doc[5].text == "\u2013"
 								    assert doc[9].text == "\u2013"
 								def test_issue3288(en_vocab):
 								    """Test that retokenization works correctly via displaCy when punctuation
 								    is merged onto the preceeding token and tensor is resized."""
 								    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
 								    heads = [1, 0, -1, 1, 0, 1, -2, -3]
 								    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
 								    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
 								    doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
 								    displacy.render(doc)
 								def test_issue3289():
 								    """Test that Language.to_bytes handles serializing a pipeline component
 								    with an uninitialized model."""
 								    nlp = English()
 								    nlp.add_pipe(nlp.create_pipe("textcat"))
 								    bytes_data = nlp.to_bytes()
 								    new_nlp = English()
 								    new_nlp.add_pipe(nlp.create_pipe("textcat"))
 								    new_nlp.from_bytes(bytes_data)
 								def test_issue3328(en_vocab):
 								    doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
 								    matcher = Matcher(en_vocab)
 								    patterns = [
 								        [{"LOWER": {"IN": ["hello", "how"]}}],
 								        [{"LOWER": {"IN": ["you", "doing"]}}],
 								    ]
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 20:21:08 +00:00
+								    matcher.add("TEST", patterns)
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								    matches = matcher(doc)
 								    assert len(matches) == 4
 								    matched_texts = [doc[start:end].text for _, start, end in matches]
 								    assert matched_texts == ["Hello", "how", "you", "doing"]
 								def test_issue3331(en_vocab):
 								    """Test that duplicate patterns for different rules result in multiple
 								    matches, one per rule.
 								    """
 								    matcher = PhraseMatcher(en_vocab)
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 20:21:08 +00:00
+								    matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
 								    matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
 								    matches = matcher(doc)
 								    assert len(matches) == 2
 								    match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
 								    assert sorted(match_ids) == ["A", "B"]
 								def test_issue3345():
 								    """Test case where preset entity crosses sentence boundary."""
 								    nlp = English()
 								    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
 								    doc[4].is_sent_start = True
 								    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
-												Default settings to configurations (#4995)

* fix grad_clip naming

* cleaning up pretrained_vectors out of cfg

* further refactoring Model init's

* move Model building out of pipes

* further refactor to require a model config when creating a pipe

* small fixes

* making cfg in nn_parser more consistent

* fixing nr_class for parser

* fixing nn_parser's nO

* fix printing of loss

* architectures in own file per type, consistent naming

* convenience methods default_tagger_config and default_tok2vec_config

* let create_pipe access default config if available for that component

* default_parser_config

* move defaults to separate folder

* allow reading nlp from package or dir with argument 'name'

* architecture spacy.VocabVectors.v1 to read static vectors from file

* cleanup

* default configs for nel, textcat, morphologizer, tensorizer

* fix imports

* fixing unit tests

* fixes and clean up

* fixing defaults, nO, fix unit tests

* restore parser IO

* fix IO

* 'fix' serialization test

* add *.cfg to manifest

* fix example configs with additional arguments

* replace Morpohologizer with Tagger

* add IO bit when testing overfitting of tagger (currently failing)

* fix IO - don't initialize when reading from disk

* expand overfitting tests to also check IO goes OK

* remove dropout from HashEmbed to fix Tagger performance

* add defaults for sentrec

* update thinc

* always pass a Model instance to a Pipe

* fix piped_added statement

* remove obsolete W029

* remove obsolete errors

* restore byte checking tests (work again)

* clean up test

* further test cleanup

* convert from config to Model in create_pipe

* bring back error when component is not initialized

* cleanup

* remove calls for nlp2.begin_training

* use thinc.api in imports

* allow setting charembed's nM and nC

* fix for hardcoded nM/nC + unit test

* formatting fixes

* trigger build

											
										
										
											2020-02-27 17:42:27 +00:00
+								    ner = EntityRecognizer(doc.vocab, default_ner())
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								    # Add the OUT action. I wouldn't have thought this would be necessary...
 								    ner.moves.add_action(5, "")
 								    ner.add_label("GPE")
 								    doc = ruler(doc)
 								    # Get into the state just before "New"
 								    state = ner.moves.init_batch([doc])[0]
 								    ner.moves.apply_transition(state, "O")
 								    ner.moves.apply_transition(state, "O")
 								    ner.moves.apply_transition(state, "O")
 								    # Check that B-GPE is valid.
 								    assert ner.moves.is_valid(state, "B-GPE")
 								def test_issue3410():
 								    texts = ["Hello world", "This is a test"]
 								    nlp = English()
 								    matcher = Matcher(nlp.vocab)
 								    phrasematcher = PhraseMatcher(nlp.vocab)
 								    with pytest.deprecated_call():
 								        docs = list(nlp.pipe(texts, n_threads=4))
 								    with pytest.deprecated_call():
 								        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
 								    with pytest.deprecated_call():
 								        list(matcher.pipe(docs, n_threads=4))
 								    with pytest.deprecated_call():
 								        list(phrasematcher.pipe(docs, n_threads=4))
-												prevent division by zero in most_similar method (#4488)


											
										
										
											2019-10-21 10:04:46 +00:00
+								def test_issue3412():
 								    data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
 								    vectors = Vectors(data=data)
-												Auto-format [ci skip]

											
										
										
											2019-10-24 14:21:08 +00:00
+								    keys, best_rows, scores = vectors.most_similar(
 								        numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
 								    )
 								    assert best_rows[0] == 2
-												prevent division by zero in most_similar method (#4488)


											
										
										
											2019-10-21 10:04:46 +00:00
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								def test_issue3447():
 								    sizes = decaying(10.0, 1.0, 0.5)
 								    size = next(sizes)
 								    assert size == 10.0
 								    size = next(sizes)
 								    assert size == 10.0 - 0.5
 								    size = next(sizes)
 								    assert size == 10.0 - 0.5 - 0.5
 								@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
 								def test_issue3449():
 								    nlp = English()
 								    nlp.add_pipe(nlp.create_pipe("sentencizer"))
 								    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
 								    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
 								    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
 								    t1 = nlp(text1)
 								    t2 = nlp(text2)
 								    t3 = nlp(text3)
 								    assert t1[5].text == "I"
 								    assert t2[5].text == "I"
 								    assert t3[5].text == "I"
-												Fix xpassing tests (#4657)

* Ignore internal warnings

* Un-xfail passing tests

* Skip instead of xfail

											
										
										
											2019-11-16 19:20:53 +00:00
+								@pytest.mark.filterwarnings("ignore::UserWarning")
-												Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent batch of empty docs to cause problems

											
										
										
											2019-10-02 10:50:48 +00:00
+								def test_issue3456():
 								    # this crashed because of a padding error in layer.ops.unflatten in thinc
 								    nlp = English()
 								    nlp.add_pipe(nlp.create_pipe("tagger"))
 								    nlp.begin_training()
-												Tidy up and auto-format

											
										
										
											2019-10-18 09:27:38 +00:00
+								    list(nlp.pipe(["hi", ""]))
-												Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent batch of empty docs to cause problems

											
										
										
											2019-10-02 10:50:48 +00:00
-												Merge regression tests

											
										
										
											2019-07-10 10:49:18 +00:00
+								def test_issue3468():
 								    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
 								    be restored after serialization."""
 								    nlp = English()
 								    nlp.add_pipe(nlp.create_pipe("sentencizer"))
 								    doc = nlp("Hello world")
 								    assert doc[0].is_sent_start
 								    assert doc.is_sentenced
 								    assert len(list(doc.sents)) == 1
 								    doc_bytes = doc.to_bytes()
 								    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
 								    assert new_doc[0].is_sent_start
 								    assert new_doc.is_sentenced
 								    assert len(list(new_doc.sents)) == 1