From 1543558d0805c78be6f4fac04fb5c764d8daa20f Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 10 May 2022 08:24:42 +0200 Subject: [PATCH] Add test for old architectures (#10751) * add v1 and v2 tests for tok2vec architectures * textcat architectures are not "layers" * test older textcat architectures * test older parser architecture --- spacy/tests/parser/test_parse.py | 29 ++++++++++ spacy/tests/pipeline/test_textcat.py | 19 ++++++- spacy/tests/pipeline/test_tok2vec.py | 39 +++++++++---- website/docs/api/legacy.md | 85 ++++++++++++++-------------- 4 files changed, 119 insertions(+), 53 deletions(-) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 7bbb30d8e..aaf31ed56 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -12,6 +12,7 @@ from spacy.vocab import Vocab from ...pipeline import DependencyParser from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL from ..util import apply_transition_sequence, make_tempdir +from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL TRAIN_DATA = [ ( @@ -395,6 +396,34 @@ def test_overfitting_IO(pipe_name): assert_equal(batch_deps_1, no_batch_deps) +# fmt: off +@pytest.mark.slow +@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) +@pytest.mark.parametrize( + "parser_config", + [ + # TransitionBasedParser V1 + ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}), + # TransitionBasedParser V2 + ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}), + ], +) +# fmt: on +def test_parser_configs(pipe_name, parser_config): + pipe_config = {"model": parser_config} + nlp = English() + parser = nlp.add_pipe(pipe_name, config=pipe_config) + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for dep in annotations.get("deps", []): + parser.add_label(dep) + optimizer = nlp.initialize() + for i in range(5): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + def test_beam_parser_scores(): # Test that we can get confidence values out of the beam_parser pipe beam_width = 16 diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 798dd165e..0bb036a33 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -382,6 +382,7 @@ def test_implicit_label(name, get_examples): # fmt: off +@pytest.mark.slow @pytest.mark.parametrize( "name,textcat_config", [ @@ -390,7 +391,10 @@ def test_implicit_label(name, get_examples): ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), - # ENSEMBLE + # ENSEMBLE V1 + ("textcat", {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}), + # ENSEMBLE V2 ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}}), ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}}), ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}}), @@ -643,15 +647,28 @@ def test_overfitting_IO_multi(): # fmt: off +@pytest.mark.slow @pytest.mark.parametrize( "name,train_data,textcat_config", [ + # BOW V1 + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), + # ENSEMBLE V1 + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}), + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "pretrained_vectors": None, "width": 64, "embed_size": 2000, "conv_depth": 2, "window_size": 1, "ngram_size": 1, "dropout": None}), + # CNN V1 + ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), + # BOW V2 ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), + # ENSEMBLE V2 ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), + # CNN V2 ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ], diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 37104c78a..64faf133d 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -1,13 +1,13 @@ import pytest from spacy.ml.models.tok2vec import build_Tok2Vec_model -from spacy.ml.models.tok2vec import MultiHashEmbed, CharacterEmbed -from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder +from spacy.ml.models.tok2vec import MultiHashEmbed, MaxoutWindowEncoder from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener from spacy.vocab import Vocab from spacy.tokens import Doc from spacy.training import Example from spacy import util from spacy.lang.en import English +from spacy.util import registry from thinc.api import Config, get_current_ops from numpy.testing import assert_array_equal @@ -55,24 +55,41 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): assert doc_vec.shape == (len(doc), width) +@pytest.mark.slow +@pytest.mark.parametrize("width", [8]) @pytest.mark.parametrize( - "width,embed_arch,embed_config,encode_arch,encode_config", + "embed_arch,embed_config", # fmt: off [ - (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), - (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), - (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), + ("spacy.MultiHashEmbed.v1", {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}), + ("spacy.MultiHashEmbed.v1", {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}), + ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}), + ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}), ], # fmt: on ) -def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_config): +@pytest.mark.parametrize( + "tok2vec_arch,encode_arch,encode_config", + # fmt: off + [ + ("spacy.Tok2Vec.v1", "spacy.MaxoutWindowEncoder.v1", {"window_size": 1, "maxout_pieces": 3, "depth": 2}), + ("spacy.Tok2Vec.v2", "spacy.MaxoutWindowEncoder.v2", {"window_size": 1, "maxout_pieces": 3, "depth": 2}), + ("spacy.Tok2Vec.v1", "spacy.MishWindowEncoder.v1", {"window_size": 1, "depth": 6}), + ("spacy.Tok2Vec.v2", "spacy.MishWindowEncoder.v2", {"window_size": 1, "depth": 6}), + ], + # fmt: on +) +def test_tok2vec_configs( + width, tok2vec_arch, embed_arch, embed_config, encode_arch, encode_config +): + embed = registry.get("architectures", embed_arch) + encode = registry.get("architectures", encode_arch) + tok2vec_model = registry.get("architectures", tok2vec_arch) + embed_config["width"] = width encode_config["width"] = width docs = get_batch(3) - tok2vec = build_Tok2Vec_model( - embed_arch(**embed_config), encode_arch(**encode_config) - ) + tok2vec = tok2vec_model(embed(**embed_config), encode(**encode_config)) tok2vec.initialize(docs) vectors, backprop = tok2vec.begin_update(docs) assert len(vectors) == len(docs) diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md index e24c37d77..31d178b67 100644 --- a/website/docs/api/legacy.md +++ b/website/docs/api/legacy.md @@ -103,11 +103,22 @@ and residual connections. | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | | **CREATES** | The model using the architecture. ~~Model[Floats2d, Floats2d]~~ | -### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1} +### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1} -Identical to -[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser) -except the `use_upper` was set to `True` by default. +Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except +using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included. + +### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1} + +Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed) +except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are +included. + +### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1} + +Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed) +except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are +included. ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1} @@ -147,41 +158,6 @@ network has an internal CNN Tok2Vec layer and uses attention. | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1} - -Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except -using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included. - -### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1} - -Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed) -except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are -included. - -### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1} - -Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed) -except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are -included. - -## Layers {#layers} - -These functions are available from `@spacy.registry.layers`. - -### spacy.StaticVectors.v1 {#StaticVectors_v1} - -Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except -for the handling of tokens without vectors. - - - -`spacy.StaticVectors.v1` maps tokens without vectors to the final row in the -vectors table, which causes the model predictions to change if new vectors are -added to an existing vectors table. See more details in -[issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655). - - - ### spacy.TextCatCNN.v1 {#TextCatCNN_v1} Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means @@ -246,8 +222,35 @@ the others, but may not be as accurate, especially if texts are short. | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1} + +Identical to +[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser) +except the `use_upper` was set to `True` by default. + +## Layers {#layers} + +These functions are available from `@spacy.registry.layers`. + +### spacy.StaticVectors.v1 {#StaticVectors_v1} + +Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except +for the handling of tokens without vectors. + + + +`spacy.StaticVectors.v1` maps tokens without vectors to the final row in the +vectors table, which causes the model predictions to change if new vectors are +added to an existing vectors table. See more details in +[issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655). + + + ## Loggers {#loggers} -Logging utilities for spaCy are implemented in the [`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the functions are typically available from `@spacy.registry.loggers`. +Logging utilities for spaCy are implemented in the +[`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the +functions are typically available from `@spacy.registry.loggers`. -More documentation can be found in that repo's [readme](https://github.com/explosion/spacy-loggers/blob/main/README.md) file. +More documentation can be found in that repo's +[readme](https://github.com/explosion/spacy-loggers/blob/main/README.md) file.