spaCy/spacy/tests/regression/test_issue3501-4000.py

import pytest
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.pipeline import EntityRuler, DependencyParser
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy import displacy, load
from spacy.displacy import parse_deps
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError
from spacy.util import minibatch
from spacy.training import Example
from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish
from spacy.lang.en import English
from spacy.attrs import IS_ALPHA
from spacy import registry
from thinc.api import compounding
import spacy
import srsly
import numpy

from ..util import make_tempdir, get_doc


@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    tok = en_tokenizer(word)[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert tok.is_stop


def test_issue_3526_1(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    ruler_bytes = ruler.to_bytes()
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 4
    assert ruler.overwrite
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(ruler_bytes)
    assert len(new_ruler) == len(ruler)
    assert len(new_ruler.labels) == 4
    assert new_ruler.overwrite == ruler.overwrite
    assert new_ruler.ent_id_sep == ruler.ent_id_sep


def test_issue_3526_2(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    for pattern in ruler.patterns:
        assert pattern in new_ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite


def test_issue_3526_3(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite


def test_issue_3526_4(en_vocab):
    nlp = Language(vocab=en_vocab)
    patterns = [{"label": "ORG", "pattern": "Apple"}]
    config = {"overwrite_ents": True}
    ruler = nlp.add_pipe("entity_ruler", config=config)
    ruler.add_patterns(patterns)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True


def test_issue3531():
    """Test that displaCy renderer doesn't require "settings" key."""
    example_dep = {
        "words": [
            {"text": "But", "tag": "CCONJ"},
            {"text": "Google", "tag": "PROPN"},
            {"text": "is", "tag": "VERB"},
            {"text": "starting", "tag": "VERB"},
            {"text": "from", "tag": "ADP"},
            {"text": "behind.", "tag": "ADV"},
        ],
        "arcs": [
            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
        ],
    }
    example_ent = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    dep_html = displacy.render(example_dep, style="dep", manual=True)
    assert dep_html
    ent_html = displacy.render(example_ent, style="ent", manual=True)
    assert ent_html


def test_issue3540(en_vocab):
    words = ["I", "live", "in", "NewYork", "right", "now"]
    tensor = numpy.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
        dtype="f",
    )
    doc = Doc(en_vocab, words=words)
    doc.tensor = tensor
    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.text for token in doc] == gold_text
    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
    for i, lemma in enumerate(gold_lemma):
        doc[i].lemma_ = lemma
    assert [token.lemma_ for token in doc] == gold_lemma
    vectors_1 = [token.vector for token in doc]
    assert len(vectors_1) == len(doc)

    with doc.retokenize() as retokenizer:
        heads = [(doc[3], 1), doc[2]]
        attrs = {
            "POS": ["PROPN", "PROPN"],
            "LEMMA": ["New", "York"],
            "DEP": ["pobj", "compound"],
        }
        retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.text for token in doc] == gold_text
    gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma
    vectors_2 = [token.vector for token in doc]
    assert len(vectors_2) == len(doc)
    assert vectors_1[0].tolist() == vectors_2[0].tolist()
    assert vectors_1[1].tolist() == vectors_2[1].tolist()
    assert vectors_1[2].tolist() == vectors_2[2].tolist()
    assert vectors_1[4].tolist() == vectors_2[5].tolist()
    assert vectors_1[5].tolist() == vectors_2[6].tolist()


def test_issue3549(en_vocab):
    """Test that match pattern validation doesn't raise on empty errors."""
    matcher = Matcher(en_vocab, validate=True)
    pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
    matcher.add("GOOD", [pattern])
    with pytest.raises(MatchPatternError):
        matcher.add("BAD", [[{"X": "Y"}]])


@pytest.mark.skip("Matching currently only works on strings and integers")
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)


def test_issue3611():
    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")
    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
    # add a text categorizer component
    model = {
        "@architectures": "spacy.TextCatBOW.v1",
        "exclusive_classes": True,
        "ngram_size": 2,
        "no_output_layer": False,
    }
    textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
    for label in unique_classes:
        textcat.add_label(label)
    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )


def test_issue3625():
    """Test that default punctuation rules applies to hindi unicode characters"""
    nlp = Hindi()
    doc = nlp("hi. how हुए. होटल, होटल")
    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
    assert [token.text for token in doc] == expected


def test_issue3803():
    """Test that spanish num-like tokens have True for like_num attribute."""
    nlp = Spanish()
    text = "2 dos 1000 mil 12 doce"
    doc = nlp(text)

    assert [t.like_num for t in doc] == [True, True, True, True, True, True]


def _parser_example(parser):
    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
    gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
    return Example.from_dict(doc, gold)


def test_issue3830_no_subtok():
    """Test that the parser doesn't have subtok label if not learn_tokens"""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.make_from_config({"model": DEFAULT_PARSER_MODEL}, validate=True)[
        "model"
    ]
    parser = DependencyParser(Vocab(), model, **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [_parser_example(parser)])
    assert "subtok" not in parser.labels


def test_issue3830_with_subtok():
    """Test that the parser does have subtok label if learn_tokens=True."""
    config = {
        "learn_tokens": True,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.make_from_config({"model": DEFAULT_PARSER_MODEL}, validate=True)[
        "model"
    ]
    parser = DependencyParser(Vocab(), model, **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [_parser_example(parser)])
    assert "subtok" in parser.labels


def test_issue3839(en_vocab):
    """Test that match IDs returned by the matcher are correct, are in the string """
    doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
    matcher = Matcher(en_vocab)
    match_id = "PATTERN"
    pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
    pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
    matcher.add(match_id, [pattern1])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]
    matcher = Matcher(en_vocab)
    matcher.add(match_id, [pattern2])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]


@pytest.mark.parametrize(
    "sentence",
    [
        "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
        "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
        "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
    ],
)
def test_issue3869(sentence):
    """Test that the Doc's count_by function works consistently"""
    nlp = English()
    doc = nlp(sentence)
    count = 0
    for token in doc:
        count += token.is_alpha
    assert count == doc.count_by(IS_ALPHA).get(1, 0)


def test_issue3879(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    assert len(doc) == 5
    pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [pattern])
    assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test'


def test_issue3880():
    """Test that `nlp.pipe()` works when an empty string ends the batch.

    Fixed in v7.0.5 of Thinc.
    """
    texts = ["hello", "world", "", ""]
    nlp = English()
    nlp.add_pipe("parser").add_label("dep")
    nlp.add_pipe("ner").add_label("PERSON")
    nlp.add_pipe("tagger").add_label("NN")
    nlp.begin_training()
    for doc in nlp.pipe(texts):
        pass


def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.user_data["test"] = set()
    parse_deps(doc)


def test_issue3951(en_vocab):
    """Test that combinations of optional rules are matched correctly."""
    matcher = Matcher(en_vocab)
    pattern = [
        {"LOWER": "hello"},
        {"LOWER": "this", "OP": "?"},
        {"OP": "?"},
        {"LOWER": "world"},
    ]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
    matches = matcher(doc)
    assert len(matches) == 0


def test_issue3959():
    """ Ensure that a modified pos attribute is serialized correctly."""
    nlp = English()
    doc = nlp(
        "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    )
    assert doc[0].pos_ == ""
    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"
    # usually this is already True when starting from proper models instead of blank English
    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)
        doc2 = nlp("")
        doc2.from_disk(file_path)
        assert doc2[0].pos_ == "NOUN"


def test_issue3962(en_vocab):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    # fmt: off
    words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
    deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
    # fmt: on
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    span2 = doc[1:5]  # "jests at scars ,"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json
    # head set to itself, being the new artificial root
    assert doc2[0].head.text == "jests"
    assert doc2[0].dep_ == "dep"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"  # head set to the new artificial root
    assert doc2[3].dep_ == "dep"
    # We should still have 1 sentence
    assert len(list(doc2.sents)) == 1
    span3 = doc[6:9]  # "never felt a"
    doc3 = span3.as_doc()
    doc3_json = doc3.to_json()
    assert doc3_json
    assert doc3[0].head.text == "felt"
    assert doc3[0].dep_ == "neg"
    assert doc3[1].head.text == "felt"
    assert doc3[1].dep_ == "ROOT"
    assert doc3[2].head.text == "felt"  # head set to ancestor
    assert doc3[2].dep_ == "dep"
    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(doc3.sents)) == 1


def test_issue3962_long(en_vocab):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    # fmt: off
    words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
    deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
    # fmt: on
    two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json
    # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].head.text == "jests"
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"
    # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].head.text == "They"
    assert doc2[4].dep_ == "dep"
    # head set to the new artificial head (in sentence 2)
    assert doc2[4].head.text == "They"
    assert doc2[4].dep_ == "dep"
    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"


def test_issue3972(en_vocab):
    """Test that the PhraseMatcher returns duplicates for duplicate match IDs.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
    matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
    doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
    matches = matcher(doc)

    assert len(matches) == 2

    # We should have a match for each of the two rules
    found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
    assert "A" in found_ids
    assert "B" in found_ids
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								import pytest
 								from spacy.language import Language
 								from spacy.vocab import Vocab
 								from spacy.pipeline import EntityRuler, DependencyParser
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 11:42:59 +00:00
+								from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								from spacy import displacy, load
 								from spacy.displacy import parse_deps
 								from spacy.tokens import Doc, Token
 								from spacy.matcher import Matcher, PhraseMatcher
 								from spacy.errors import MatchPatternError
 								from spacy.util import minibatch
-												Renaming gold & annotation_setter (#6042)

* version bump to 3.0.0a16

* rename "gold" folder to "training"

* rename 'annotation_setter' to 'set_extra_annotations'

* formatting
											
										
										
											2020-09-09 08:31:03 +00:00
+								from spacy.training import Example
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								from spacy.lang.hi import Hindi
 								from spacy.lang.es import Spanish
 								from spacy.lang.en import English
 								from spacy.attrs import IS_ALPHA
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 11:42:59 +00:00
+								from spacy import registry
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								from thinc.api import compounding
 								import spacy
 								import srsly
 								import numpy
 								from ..util import make_tempdir, get_doc
 								@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
 								def test_issue3521(en_tokenizer, word):
 								    tok = en_tokenizer(word)[1]
 								    # 'not' and 'would' should be stopwords, also in their abbreviated forms
 								    assert tok.is_stop
 								def test_issue_3526_1(en_vocab):
 								    patterns = [
 								        {"label": "HELLO", "pattern": "hello world"},
 								        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
 								        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
 								        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
 								        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
 								    ]
 								    nlp = Language(vocab=en_vocab)
 								    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
 								    ruler_bytes = ruler.to_bytes()
 								    assert len(ruler) == len(patterns)
 								    assert len(ruler.labels) == 4
 								    assert ruler.overwrite
 								    new_ruler = EntityRuler(nlp)
 								    new_ruler = new_ruler.from_bytes(ruler_bytes)
 								    assert len(new_ruler) == len(ruler)
 								    assert len(new_ruler.labels) == 4
 								    assert new_ruler.overwrite == ruler.overwrite
 								    assert new_ruler.ent_id_sep == ruler.ent_id_sep
 								def test_issue_3526_2(en_vocab):
 								    patterns = [
 								        {"label": "HELLO", "pattern": "hello world"},
 								        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
 								        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
 								        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
 								        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
 								    ]
 								    nlp = Language(vocab=en_vocab)
 								    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
 								    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
 								    new_ruler = EntityRuler(nlp)
 								    new_ruler = new_ruler.from_bytes(bytes_old_style)
 								    assert len(new_ruler) == len(ruler)
 								    for pattern in ruler.patterns:
 								        assert pattern in new_ruler.patterns
 								    assert new_ruler.overwrite is not ruler.overwrite
 								def test_issue_3526_3(en_vocab):
 								    patterns = [
 								        {"label": "HELLO", "pattern": "hello world"},
 								        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
 								        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
 								        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
 								        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
 								    ]
 								    nlp = Language(vocab=en_vocab)
 								    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
 								    with make_tempdir() as tmpdir:
 								        out_file = tmpdir / "entity_ruler"
 								        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
 								        new_ruler = EntityRuler(nlp).from_disk(out_file)
 								        for pattern in ruler.patterns:
 								            assert pattern in new_ruler.patterns
 								        assert len(new_ruler) == len(ruler)
 								        assert new_ruler.overwrite is not ruler.overwrite
 								def test_issue_3526_4(en_vocab):
 								    nlp = Language(vocab=en_vocab)
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 11:42:59 +00:00
+								    patterns = [{"label": "ORG", "pattern": "Apple"}]
 								    config = {"overwrite_ents": True}
 								    ruler = nlp.add_pipe("entity_ruler", config=config)
 								    ruler.add_patterns(patterns)
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								    with make_tempdir() as tmpdir:
 								        nlp.to_disk(tmpdir)
 								        ruler = nlp.get_pipe("entity_ruler")
 								        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
 								        assert ruler.overwrite is True
 								        nlp2 = load(tmpdir)
 								        new_ruler = nlp2.get_pipe("entity_ruler")
 								        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
 								        assert new_ruler.overwrite is True
 								def test_issue3531():
 								    """Test that displaCy renderer doesn't require "settings" key."""
 								    example_dep = {
 								        "words": [
 								            {"text": "But", "tag": "CCONJ"},
 								            {"text": "Google", "tag": "PROPN"},
 								            {"text": "is", "tag": "VERB"},
 								            {"text": "starting", "tag": "VERB"},
 								            {"text": "from", "tag": "ADP"},
 								            {"text": "behind.", "tag": "ADV"},
 								        ],
 								        "arcs": [
 								            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
 								            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
 								            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
 								            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
 								            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
 								        ],
 								    }
 								    example_ent = {
 								        "text": "But Google is starting from behind.",
 								        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
 								    }
 								    dep_html = displacy.render(example_dep, style="dep", manual=True)
 								    assert dep_html
 								    ent_html = displacy.render(example_ent, style="ent", manual=True)
 								    assert ent_html
 								def test_issue3540(en_vocab):
 								    words = ["I", "live", "in", "NewYork", "right", "now"]
 								    tensor = numpy.asarray(
 								        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
 								        dtype="f",
 								    )
 								    doc = Doc(en_vocab, words=words)
 								    doc.tensor = tensor
 								    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
 								    assert [token.text for token in doc] == gold_text
 								    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
-												Add Lemmatizer and simplify related components (#5848)

* Add Lemmatizer and simplify related components

* Add `Lemmatizer` pipe with `lookup` and `rule` modes using the
`Lookups` tables.
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma)
* Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer,
or morph rules)
* Remove lemmatizer from `Vocab`
* Adjust many many tests

Differences:

* No default lookup lemmas
* No special treatment of TAG in `from_array` and similar required
* Easier to modify labels in a `Tagger`
* No extra strings added from morphology / tag map

* Fix test

* Initial fix for Lemmatizer config/serialization

* Adjust init test to be more generic

* Adjust init test to force empty Lookups

* Add simple cache to rule-based lemmatizer

* Convert language-specific lemmatizers

Convert language-specific lemmatizers to component lemmatizers. Remove
previous lemmatizer class.

* Fix French and Polish lemmatizers

* Remove outdated UPOS conversions

* Update Russian lemmatizer init in tests

* Add minimal init/run tests for custom lemmatizers

* Add option to overwrite existing lemmas

* Update mode setting, lookup loading, and caching

* Make `mode` an immutable property
* Only enforce strict `load_lookups` for known supported modes
* Move caching into individual `_lemmatize` methods

* Implement strict when lang is not found in lookups

* Fix tables/lookups in make_lemmatizer

* Reallow provided lookups and allow for stricter checks

* Add lookups asset to all Lemmatizer pipe tests

* Rename lookups in lemmatizer init test

* Clean up merge

* Refactor lookup table loading

* Add helper from `load_lemmatizer_lookups` that loads required and
optional lookups tables based on settings provided by a config.

Additional slight refactor of lookups:

* Add `Lookups.set_table` to set a table from a provided `Table`
* Reorder class definitions to be able to specify type as `Table`

* Move registry assets into test methods

* Refactor lookups tables config

Use class methods within `Lemmatizer` to provide the config for
particular modes and to load the lookups from a config.

* Add pipe and score to lemmatizer

* Simplify Tagger.score

* Add missing import

* Clean up imports and auto-format

* Remove unused kwarg

* Tidy up and auto-format

* Update docstrings for Lemmatizer

Update docstrings for Lemmatizer.

Additionally modify `is_base_form` API to take `Token` instead of
individual features.

* Update docstrings

* Remove tag map values from Tagger.add_label

* Update API docs

* Fix relative link in Lemmatizer API docs
											
										
										
											2020-08-07 13:27:13 +00:00
+								    for i, lemma in enumerate(gold_lemma):
 								        doc[i].lemma_ = lemma
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								    assert [token.lemma_ for token in doc] == gold_lemma
 								    vectors_1 = [token.vector for token in doc]
 								    assert len(vectors_1) == len(doc)
 								    with doc.retokenize() as retokenizer:
 								        heads = [(doc[3], 1), doc[2]]
-												Tidy up and auto-format

											
										
										
											2020-08-09 20:36:23 +00:00
+								        attrs = {
 								            "POS": ["PROPN", "PROPN"],
 								            "LEMMA": ["New", "York"],
 								            "DEP": ["pobj", "compound"],
 								        }
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								        retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
 								    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
 								    assert [token.text for token in doc] == gold_text
 								    gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
 								    assert [token.lemma_ for token in doc] == gold_lemma
 								    vectors_2 = [token.vector for token in doc]
 								    assert len(vectors_2) == len(doc)
 								    assert vectors_1[0].tolist() == vectors_2[0].tolist()
 								    assert vectors_1[1].tolist() == vectors_2[1].tolist()
 								    assert vectors_1[2].tolist() == vectors_2[2].tolist()
 								    assert vectors_1[4].tolist() == vectors_2[5].tolist()
 								    assert vectors_1[5].tolist() == vectors_2[6].tolist()
 								def test_issue3549(en_vocab):
 								    """Test that match pattern validation doesn't raise on empty errors."""
 								    matcher = Matcher(en_vocab, validate=True)
 								    pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
 								    matcher.add("GOOD", [pattern])
 								    with pytest.raises(MatchPatternError):
 								        matcher.add("BAD", [[{"X": "Y"}]])
-												Test suite clean up (#5781)

* step_through tests: skip instead of xfail

* test_empty_doc should be fixed with new Thinc version

* remove outdated test (there are other misaligned tests now)

* xfail reason

* fix test according to french exceptions

* clarified some skipped tests

* skip ukranian test instead of xfail

* skip instead of xfail

* skip + reason instead of xfail

* removed obsolete tests referring to removed "set_frozen" functionality

* fix test 999

* remove unused AlignmentError

* remove xfail where possible, skip otherwise

* increment thinc release for empty_doc test
											
										
										
											2020-07-20 12:49:54 +00:00
+								@pytest.mark.skip("Matching currently only works on strings and integers")
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								def test_issue3555(en_vocab):
 								    """Test that custom extensions with default None don't break matcher."""
 								    Token.set_extension("issue3555", default=None)
 								    matcher = Matcher(en_vocab)
-												Test suite clean up (#5781)

* step_through tests: skip instead of xfail

* test_empty_doc should be fixed with new Thinc version

* remove outdated test (there are other misaligned tests now)

* xfail reason

* fix test according to french exceptions

* clarified some skipped tests

* skip ukranian test instead of xfail

* skip instead of xfail

* skip + reason instead of xfail

* removed obsolete tests referring to removed "set_frozen" functionality

* fix test 999

* remove unused AlignmentError

* remove xfail where possible, skip otherwise

* increment thinc release for empty_doc test
											
										
										
											2020-07-20 12:49:54 +00:00
+								    pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}]
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								    matcher.add("TEST", [pattern])
 								    doc = Doc(en_vocab, words=["have", "apple"])
 								    matcher(doc)
 								def test_issue3611():
 								    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
 								    unique_classes = ["offensive", "inoffensive"]
 								    x_train = [
 								        "This is an offensive text",
 								        "This is the second offensive text",
 								        "inoff",
 								    ]
 								    y_train = ["offensive", "offensive", "inoffensive"]
 								    nlp = spacy.blank("en")
 								    # preparing the data
 								    train_data = []
 								    for text, train_instance in zip(x_train, y_train):
 								        cat_dict = {label: label == train_instance for label in unique_classes}
 								        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
 								    # add a text categorizer component
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 11:42:59 +00:00
+								    model = {
 								        "@architectures": "spacy.TextCatBOW.v1",
 								        "exclusive_classes": True,
 								        "ngram_size": 2,
 								        "no_output_layer": False,
 								    }
 								    textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								    for label in unique_classes:
 								        textcat.add_label(label)
 								    # training the network
 								    with nlp.select_pipes(enable="textcat"):
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 11:42:59 +00:00
+								        optimizer = nlp.begin_training()
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								        for i in range(3):
 								            losses = {}
 								            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
 								            for batch in batches:
 								                nlp.update(
 								                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
 								                )
 								def test_issue3625():
 								    """Test that default punctuation rules applies to hindi unicode characters"""
 								    nlp = Hindi()
 								    doc = nlp("hi. how हुए. होटल, होटल")
 								    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
 								    assert [token.text for token in doc] == expected
 								def test_issue3803():
 								    """Test that spanish num-like tokens have True for like_num attribute."""
 								    nlp = Spanish()
 								    text = "2 dos 1000 mil 12 doce"
 								    doc = nlp(text)
 								    assert [t.like_num for t in doc] == [True, True, True, True, True, True]
-												Pipe API (#6034)

* ensure Language passes on valid examples for initialization

* fix tagger model initialization

* check for valid get_examples across components

* assume labels were added before begin_training

* fix senter initialization

* fix morphologizer initialization

* use methods to check arguments

* test textcat init, requires thinc>=8.0.0a31

* fix tok2vec init

* fix entity linker init

* use islice

* fix simple NER

* cleanup debug model

* fix assert statements

* fix tests

* throw error when adding a label if the output layer can't be resized anymore

* fix test

* add failing test for simple_ner

* UX improvements

* morphologizer UX

* assume begin_training gets a representative set and processes the labels

* remove assumptions for output of untrained NER model

* restore test for original purpose
											
										
										
											2020-09-08 20:44:25 +00:00
+								def _parser_example(parser):
 								    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
 								    gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
 								    return Example.from_dict(doc, gold)
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								def test_issue3830_no_subtok():
 								    """Test that the parser doesn't have subtok label if not learn_tokens"""
 								    config = {
 								        "learn_tokens": False,
 								        "min_action_freq": 30,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 11:42:59 +00:00
+								        "update_with_oracle_cut_size": 100,
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								    }
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 11:42:59 +00:00
+								    model = registry.make_from_config({"model": DEFAULT_PARSER_MODEL}, validate=True)[
 								        "model"
 								    ]
 								    parser = DependencyParser(Vocab(), model, **config)
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								    parser.add_label("nsubj")
 								    assert "subtok" not in parser.labels
-												Pipe API (#6034)

* ensure Language passes on valid examples for initialization

* fix tagger model initialization

* check for valid get_examples across components

* assume labels were added before begin_training

* fix senter initialization

* fix morphologizer initialization

* use methods to check arguments

* test textcat init, requires thinc>=8.0.0a31

* fix tok2vec init

* fix entity linker init

* use islice

* fix simple NER

* cleanup debug model

* fix assert statements

* fix tests

* throw error when adding a label if the output layer can't be resized anymore

* fix test

* add failing test for simple_ner

* UX improvements

* morphologizer UX

* assume begin_training gets a representative set and processes the labels

* remove assumptions for output of untrained NER model

* restore test for original purpose
											
										
										
											2020-09-08 20:44:25 +00:00
+								    parser.begin_training(lambda: [_parser_example(parser)])
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								    assert "subtok" not in parser.labels
 								def test_issue3830_with_subtok():
 								    """Test that the parser does have subtok label if learn_tokens=True."""
 								    config = {
 								        "learn_tokens": True,
 								        "min_action_freq": 30,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 11:42:59 +00:00
+								        "update_with_oracle_cut_size": 100,
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								    }
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 11:42:59 +00:00
+								    model = registry.make_from_config({"model": DEFAULT_PARSER_MODEL}, validate=True)[
 								        "model"
 								    ]
 								    parser = DependencyParser(Vocab(), model, **config)
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								    parser.add_label("nsubj")
 								    assert "subtok" not in parser.labels
-												Pipe API (#6034)

* ensure Language passes on valid examples for initialization

* fix tagger model initialization

* check for valid get_examples across components

* assume labels were added before begin_training

* fix senter initialization

* fix morphologizer initialization

* use methods to check arguments

* test textcat init, requires thinc>=8.0.0a31

* fix tok2vec init

* fix entity linker init

* use islice

* fix simple NER

* cleanup debug model

* fix assert statements

* fix tests

* throw error when adding a label if the output layer can't be resized anymore

* fix test

* add failing test for simple_ner

* UX improvements

* morphologizer UX

* assume begin_training gets a representative set and processes the labels

* remove assumptions for output of untrained NER model

* restore test for original purpose
											
										
										
											2020-09-08 20:44:25 +00:00
+								    parser.begin_training(lambda: [_parser_example(parser)])
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								    assert "subtok" in parser.labels
 								def test_issue3839(en_vocab):
 								    """Test that match IDs returned by the matcher are correct, are in the string """
 								    doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
 								    matcher = Matcher(en_vocab)
 								    match_id = "PATTERN"
 								    pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
 								    pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
 								    matcher.add(match_id, [pattern1])
 								    matches = matcher(doc)
 								    assert matches[0][0] == en_vocab.strings[match_id]
 								    matcher = Matcher(en_vocab)
 								    matcher.add(match_id, [pattern2])
 								    matches = matcher(doc)
 								    assert matches[0][0] == en_vocab.strings[match_id]
 								@pytest.mark.parametrize(
 								    "sentence",
 								    [
 								        "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
 								        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
 								        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
 								        "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
 								        "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
 								    ],
 								)
 								def test_issue3869(sentence):
 								    """Test that the Doc's count_by function works consistently"""
 								    nlp = English()
 								    doc = nlp(sentence)
 								    count = 0
 								    for token in doc:
 								        count += token.is_alpha
 								    assert count == doc.count_by(IS_ALPHA).get(1, 0)
 								def test_issue3879(en_vocab):
 								    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
 								    assert len(doc) == 5
 								    pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
 								    matcher = Matcher(en_vocab)
 								    matcher.add("TEST", [pattern])
 								    assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test'
 								def test_issue3880():
 								    """Test that `nlp.pipe()` works when an empty string ends the batch.
 								    Fixed in v7.0.5 of Thinc.
 								    """
 								    texts = ["hello", "world", "", ""]
 								    nlp = English()
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 11:42:59 +00:00
+								    nlp.add_pipe("parser").add_label("dep")
 								    nlp.add_pipe("ner").add_label("PERSON")
 								    nlp.add_pipe("tagger").add_label("NN")
-												Tidy up and merge regression tests

											
										
										
											2020-07-06 12:05:59 +00:00
+								    nlp.begin_training()
 								    for doc in nlp.pipe(texts):
 								        pass
 								def test_issue3882(en_vocab):
 								    """Test that displaCy doesn't serialize the doc.user_data when making a
 								    copy of the Doc.
 								    """
 								    doc = Doc(en_vocab, words=["Hello", "world"])
 								    doc.user_data["test"] = set()
 								    parse_deps(doc)
 								def test_issue3951(en_vocab):
 								    """Test that combinations of optional rules are matched correctly."""
 								    matcher = Matcher(en_vocab)
 								    pattern = [
 								        {"LOWER": "hello"},
 								        {"LOWER": "this", "OP": "?"},
 								        {"OP": "?"},
 								        {"LOWER": "world"},
 								    ]
 								    matcher.add("TEST", [pattern])
 								    doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
 								    matches = matcher(doc)
 								    assert len(matches) == 0
 								def test_issue3959():
 								    """ Ensure that a modified pos attribute is serialized correctly."""
 								    nlp = English()
 								    doc = nlp(
 								        "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
 								    )
 								    assert doc[0].pos_ == ""
 								    doc[0].pos_ = "NOUN"
 								    assert doc[0].pos_ == "NOUN"
 								    # usually this is already True when starting from proper models instead of blank English
 								    with make_tempdir() as tmp_dir:
 								        file_path = tmp_dir / "my_doc"
 								        doc.to_disk(file_path)
 								        doc2 = nlp("")
 								        doc2.from_disk(file_path)
 								        assert doc2[0].pos_ == "NOUN"
 								def test_issue3962(en_vocab):
 								    """ Ensure that as_doc does not result in out-of-bound access of tokens.
 								    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
 								    # fmt: off
 								    words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
 								    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
 								    deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
 								    # fmt: on
 								    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
 								    span2 = doc[1:5]  # "jests at scars ,"
 								    doc2 = span2.as_doc()
 								    doc2_json = doc2.to_json()
 								    assert doc2_json
 								    # head set to itself, being the new artificial root
 								    assert doc2[0].head.text == "jests"
 								    assert doc2[0].dep_ == "dep"
 								    assert doc2[1].head.text == "jests"
 								    assert doc2[1].dep_ == "prep"
 								    assert doc2[2].head.text == "at"
 								    assert doc2[2].dep_ == "pobj"
 								    assert doc2[3].head.text == "jests"  # head set to the new artificial root
 								    assert doc2[3].dep_ == "dep"
 								    # We should still have 1 sentence
 								    assert len(list(doc2.sents)) == 1
 								    span3 = doc[6:9]  # "never felt a"
 								    doc3 = span3.as_doc()
 								    doc3_json = doc3.to_json()
 								    assert doc3_json
 								    assert doc3[0].head.text == "felt"
 								    assert doc3[0].dep_ == "neg"
 								    assert doc3[1].head.text == "felt"
 								    assert doc3[1].dep_ == "ROOT"
 								    assert doc3[2].head.text == "felt"  # head set to ancestor
 								    assert doc3[2].dep_ == "dep"
 								    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
 								    assert len(list(doc3.sents)) == 1
 								def test_issue3962_long(en_vocab):
 								    """ Ensure that as_doc does not result in out-of-bound access of tokens.
 								    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
 								    # fmt: off
 								    words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
 								    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
 								    deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
 								    # fmt: on
 								    two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
 								    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
 								    doc2 = span2.as_doc()
 								    doc2_json = doc2.to_json()
 								    assert doc2_json
 								    # head set to itself, being the new artificial root (in sentence 1)
 								    assert doc2[0].head.text == "jests"
 								    assert doc2[0].dep_ == "ROOT"
 								    assert doc2[1].head.text == "jests"
 								    assert doc2[1].dep_ == "prep"
 								    assert doc2[2].head.text == "at"
 								    assert doc2[2].dep_ == "pobj"
 								    assert doc2[3].head.text == "jests"
 								    assert doc2[3].dep_ == "punct"
 								    # head set to itself, being the new artificial root (in sentence 2)
 								    assert doc2[4].head.text == "They"
 								    assert doc2[4].dep_ == "dep"
 								    # head set to the new artificial head (in sentence 2)
 								    assert doc2[4].head.text == "They"
 								    assert doc2[4].dep_ == "dep"
 								    # We should still have 2 sentences
 								    sents = list(doc2.sents)
 								    assert len(sents) == 2
 								    assert sents[0].text == "jests at scars ."
 								    assert sents[1].text == "They never"
 								def test_issue3972(en_vocab):
 								    """Test that the PhraseMatcher returns duplicates for duplicate match IDs.
 								    """
 								    matcher = PhraseMatcher(en_vocab)
 								    matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
 								    matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
 								    doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
 								    matches = matcher(doc)
 								    assert len(matches) == 2
 								    # We should have a match for each of the two rules
 								    found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
 								    assert "A" in found_ids
 								    assert "B" in found_ids