From 5847be6022e615cdea55ca5a7856d203254e7ddf Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 8 Mar 2020 13:23:18 +0100 Subject: [PATCH] Tok2Vec: extract-embed-encode (#5102) * avoid changing original config * fix elif structure, batch with just int crashes otherwise * tok2vec example with doc2feats, encode and embed architectures * further clean up MultiHashEmbed * further generalize Tok2Vec to work with extract-embed-encode parts * avoid initializing the charembed layer with Docs (for now ?) * small fixes for bilstm config (still does not run) * rename to core layer * move new configs * walk model to set nI instead of using core ref * fix senter overfitting test to be more similar to the training data (avoid flakey behaviour) --- .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 2 +- .../tok2vec-ner/charembed_tok2vec.cfg | 65 ++++++ .../tok2vec-ner/multihashembed_tok2vec.cfg | 65 ++++++ spacy/language.py | 9 +- spacy/ml/_character_embed.py | 2 +- spacy/ml/models/tok2vec.py | 199 +++++++----------- spacy/ml/tok2vec.py | 0 spacy/pipeline/tok2vec.py | 7 +- spacy/tests/pipeline/test_senter.py | 12 +- spacy/util.py | 7 +- 10 files changed, 227 insertions(+), 141 deletions(-) create mode 100644 examples/experiments/tok2vec-ner/charembed_tok2vec.cfg create mode 100644 examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg delete mode 100644 spacy/ml/tok2vec.py diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index 4f1a915c5..b6b4e82b6 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -62,4 +62,4 @@ width = 96 depth = 4 embed_size = 2000 subword_features = true -char_embed = false +maxout_pieces = 3 diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg new file mode 100644 index 000000000..b8219ad10 --- /dev/null +++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg @@ -0,0 +1,65 @@ +[training] +use_gpu = -1 +limit = 0 +dropout = 0.2 +patience = 10000 +eval_frequency = 200 +scores = ["ents_f"] +score_weights = {"ents_f": 1} +orth_variant_level = 0.0 +gold_preproc = true +max_length = 0 +batch_size = 25 + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 + +[nlp] +lang = "en" +vectors = null + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[nlp.pipeline.tok2vec.model.extract] +@architectures = "spacy.CharacterEmbed.v1" +width = 96 +nM = 64 +nC = 8 +rows = 2000 +columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] + +[nlp.pipeline.tok2vec.model.extract.features] +@architectures = "spacy.Doc2Feats.v1" +columns = ${nlp.pipeline.tok2vec.model.extract:columns} + +[nlp.pipeline.tok2vec.model.embed] +@architectures = "spacy.LayerNormalizedMaxout.v1" +width = ${nlp.pipeline.tok2vec.model.extract:width} +maxout_pieces = 4 + +[nlp.pipeline.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${nlp.pipeline.tok2vec.model.extract:width} +window_size = 1 +maxout_pieces = 2 +depth = 2 + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 6 +hidden_width = 64 +maxout_pieces = 2 + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model.extract:width} diff --git 
a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg new file mode 100644 index 000000000..4678a7d6b --- /dev/null +++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg @@ -0,0 +1,65 @@ +[training] +use_gpu = -1 +limit = 0 +dropout = 0.2 +patience = 10000 +eval_frequency = 200 +scores = ["ents_f"] +score_weights = {"ents_f": 1} +orth_variant_level = 0.0 +gold_preproc = true +max_length = 0 +batch_size = 25 + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 + +[nlp] +lang = "en" +vectors = null + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[nlp.pipeline.tok2vec.model.extract] +@architectures = "spacy.Doc2Feats.v1" +columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] + +[nlp.pipeline.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +columns = ${nlp.pipeline.tok2vec.model.extract:columns} +width = 96 +rows = 2000 +use_subwords = true +pretrained_vectors = null + +[nlp.pipeline.tok2vec.model.embed.mix] +@architectures = "spacy.LayerNormalizedMaxout.v1" +width = ${nlp.pipeline.tok2vec.model.embed:width} +maxout_pieces = 3 + +[nlp.pipeline.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = ${nlp.pipeline.tok2vec.model.embed:width} +window_size = 1 +maxout_pieces = 3 +depth = 2 + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 6 +hidden_width = 64 +maxout_pieces = 2 + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model.embed:width} diff --git a/spacy/language.py b/spacy/language.py index d0077b9d2..20e29c829 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -337,13 +337,14 @@ class Language(object): default_config = self.defaults.get(name, None) # transform the model's config to an actual Model + factory_cfg = dict(config) model_cfg = None - if "model" in config: - model_cfg = config["model"] + if "model" in factory_cfg: + model_cfg = factory_cfg["model"] if not isinstance(model_cfg, dict): warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name)) model_cfg = None - del config["model"] + del factory_cfg["model"] if model_cfg is None and default_config is not None: warnings.warn(Warnings.W098.format(name=name)) model_cfg = default_config["model"] @@ -353,7 +354,7 @@ class Language(object): model = registry.make_from_config({"model": model_cfg}, validate=True)[ "model" ] - return factory(self, model, **config) + return factory(self, model, **factory_cfg) def add_pipe( self, component, name=None, before=None, after=None, first=None, last=None diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py index b366f67c6..f4890144a 100644 --- a/spacy/ml/_character_embed.py +++ b/spacy/ml/_character_embed.py @@ -21,7 +21,7 @@ def init(model, X=None, Y=None): def forward(model, docs, is_train): - if not docs: + if docs is None: return [] ids = [] output = [] diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 0d33d010d..d1a98c080 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -4,7 +4,7 @@ from thinc.api import HashEmbed, StaticVectors, PyTorchLSTM from thinc.api import residual, LayerNorm, FeatureExtractor, Mish from ... 
import util -from ...util import registry, make_layer +from ...util import registry from ...ml import _character_embed from ...pipeline.tok2vec import Tok2VecListener from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE @@ -23,15 +23,14 @@ def get_vocab_vectors(name): @registry.architectures.register("spacy.Tok2Vec.v1") -def Tok2Vec(config): - doc2feats = make_layer(config["@doc2feats"]) - embed = make_layer(config["@embed"]) - encode = make_layer(config["@encode"]) +def Tok2Vec(extract, embed, encode): field_size = 0 - if encode.has_attr("receptive_field"): + if encode.attrs.get("receptive_field", None): field_size = encode.attrs["receptive_field"] - tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size)) - tok2vec.attrs["cfg"] = config + with Model.define_operators({">>": chain, "|": concatenate}): + if extract.has_dim("nO"): + _set_dims(embed, "nI", extract.get_dim("nO")) + tok2vec = extract >> with_array(embed >> encode, pad=field_size) tok2vec.set_dim("nO", encode.get_dim("nO")) tok2vec.set_ref("embed", embed) tok2vec.set_ref("encode", encode) @@ -39,8 +38,7 @@ def Tok2Vec(config): @registry.architectures.register("spacy.Doc2Feats.v1") -def Doc2Feats(config): - columns = config["columns"] +def Doc2Feats(columns): return FeatureExtractor(columns) @@ -79,8 +77,8 @@ def hash_charembed_cnn( maxout_pieces, window_size, subword_features, - nM=0, - nC=0, + nM, + nC, ): # Allows using character embeddings by setting nC, nM and char_embed=True return build_Tok2Vec_model( @@ -100,7 +98,7 @@ def hash_charembed_cnn( @registry.architectures.register("spacy.HashEmbedBiLSTM.v1") def hash_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features + pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -109,7 +107,7 @@ def hash_embed_bilstm_v1( pretrained_vectors=pretrained_vectors, bilstm_depth=depth, conv_depth=0, - maxout_pieces=0, + maxout_pieces=maxout_pieces, window_size=1, subword_features=subword_features, char_embed=False, @@ -120,7 +118,7 @@ def hash_embed_bilstm_v1( @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") def hash_char_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0 + pretrained_vectors, width, depth, embed_size, subword_features, nM, nC, maxout_pieces ): # Allows using character embeddings by setting nC, nM and char_embed=True return build_Tok2Vec_model( @@ -129,7 +127,7 @@ def hash_char_embed_bilstm_v1( pretrained_vectors=pretrained_vectors, bilstm_depth=depth, conv_depth=0, - maxout_pieces=0, + maxout_pieces=maxout_pieces, window_size=1, subword_features=subword_features, char_embed=True, @@ -138,104 +136,99 @@ def hash_char_embed_bilstm_v1( ) -@registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(config): - # For backwards compatibility with models before the architecture registry, - # we have to be careful to get exactly the same model structure. One subtle - # trick is that when we define concatenation with the operator, the operator - # is actually binary associative. So when we write (a | b | c), we're actually - # getting concatenate(concatenate(a, b), c). That's why the implementation - # is a bit ugly here. 
- cols = config["columns"] - width = config["width"] - rows = config["rows"] +@registry.architectures.register("spacy.LayerNormalizedMaxout.v1") +def LayerNormalizedMaxout(width, maxout_pieces): + return Maxout( + nO=width, + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ) - norm = HashEmbed(width, rows, column=cols.index("NORM")) - if config["use_subwords"]: - prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX")) - suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX")) - shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE")) - if config.get("@pretrained_vectors"): - glove = make_layer(config["@pretrained_vectors"]) - mix = make_layer(config["@mix"]) + +@registry.architectures.register("spacy.MultiHashEmbed.v1") +def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) + if use_subwords: + prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX")) + suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX")) + shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE")) + + if pretrained_vectors: + glove = StaticVectors( + vectors=pretrained_vectors.data, + nO=width, + column=columns.index(ID), + dropout=0.0, + ) with Model.define_operators({">>": chain, "|": concatenate}): - if config["use_subwords"] and config["@pretrained_vectors"]: - mix._layers[0].set_dim("nI", width * 5) - layer = uniqued( - (glove | norm | prefix | suffix | shape) >> mix, - column=cols.index("ORTH"), - ) - elif config["use_subwords"]: - mix._layers[0].set_dim("nI", width * 4) - layer = uniqued( - (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH") - ) - elif config["@pretrained_vectors"]: - mix._layers[0].set_dim("nI", width * 2) - layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH")) + if not use_subwords and not pretrained_vectors: + embed_layer = norm else: - layer = norm - layer.attrs["cfg"] = config - return layer + if use_subwords and pretrained_vectors: + nr_columns = 5 + concat_columns = glove | norm | prefix | suffix | shape + elif use_subwords: + nr_columns = 4 + concat_columns = norm | prefix | suffix | shape + else: + nr_columns = 2 + concat_columns = glove | norm + _set_dims(mix, "nI", width * nr_columns) + embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH")) + + return embed_layer + + +def _set_dims(model, name, value): + # Loop through the model to set a specific dimension if its unset on any layer. 
+ for node in model.walk(): + if node.has_dim(name) is None: + node.set_dim(name, value) @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(config): - width = config["width"] - chars = config["chars"] - - chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars) - other_tables = make_layer(config["@embed_features"]) - mix = make_layer(config["@mix"]) - - model = chain(concatenate(chr_embed, other_tables), mix) - model.attrs["cfg"] = config - return model +def CharacterEmbed(columns, width, rows, nM, nC, features): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) + chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) + with Model.define_operators({">>": chain, "|": concatenate}): + embed_layer = chr_embed | features >> with_array(norm) + embed_layer.set_dim("nO", nM * nC + width) + return embed_layer @registry.architectures.register("spacy.MaxoutWindowEncoder.v1") -def MaxoutWindowEncoder(config): - nO = config["width"] - nW = config["window_size"] - nP = config["pieces"] - depth = config["depth"] - - cnn = ( - expand_window(window_size=nW), - Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True), +def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth): + cnn = chain( + expand_window(window_size=window_size), + Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True), ) model = clone(residual(cnn), depth) - model.set_dim("nO", nO) - model.attrs["receptive_field"] = nW * depth + model.set_dim("nO", width) + model.attrs["receptive_field"] = window_size * depth return model @registry.architectures.register("spacy.MishWindowEncoder.v1") -def MishWindowEncoder(config): - nO = config["width"] - nW = config["window_size"] - depth = config["depth"] - +def MishWindowEncoder(width, window_size, depth): cnn = chain( - expand_window(window_size=nW), - Mish(nO=nO, nI=nO * ((nW * 2) + 1)), - LayerNorm(nO), + expand_window(window_size=window_size), + Mish(nO=width, nI=width * ((window_size * 2) + 1)), + LayerNorm(width), ) model = clone(residual(cnn), depth) - model.set_dim("nO", nO) + model.set_dim("nO", width) return model @registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") -def TorchBiLSTMEncoder(config): +def TorchBiLSTMEncoder(width, depth): import torch.nn # TODO FIX from thinc.api import PyTorchRNNWrapper - width = config["width"] - depth = config["depth"] if depth == 0: return noop() return with_padded( @@ -243,40 +236,6 @@ def TorchBiLSTMEncoder(config): ) -# TODO: update -_EXAMPLE_CONFIG = { - "@doc2feats": { - "arch": "Doc2Feats", - "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]}, - }, - "@embed": { - "arch": "spacy.MultiHashEmbed.v1", - "config": { - "width": 96, - "rows": 2000, - "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"], - "use_subwords": True, - "@pretrained_vectors": { - "arch": "TransformedStaticVectors", - "config": { - "vectors_name": "en_vectors_web_lg.vectors", - "width": 96, - "column": 0, - }, - }, - "@mix": { - "arch": "LayerNormalizedMaxout", - "config": {"width": 96, "pieces": 3}, - }, - }, - }, - "@encode": { - "arch": "MaxoutWindowEncode", - "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3}, - }, -} - - def build_Tok2Vec_model( width, embed_size, diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 2fee6881a..4623f99b0 100644 --- 
a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -131,9 +131,10 @@ class Tok2Vec(Pipe): get_examples (function): Function returning example training data. pipeline (list): The pipeline the model is part of. """ - # TODO: use examples instead ? - docs = [Doc(Vocab(), words=["hello"])] - self.model.initialize(X=docs) + # TODO: charembed does not play nicely with dim inference yet + # docs = [Doc(Vocab(), words=["hello"])] + # self.model.initialize(X=docs) + self.model.initialize() link_vectors_to_models(self.vocab) diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 7a929a6a2..411768e5f 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -36,17 +36,17 @@ def test_overfitting_IO(): assert losses["senter"] < 0.0001 # test the trained model - test_text = "I like eggs. There is ham. She likes ham." + test_text = "I like purple eggs. They eat ham. You like yellow eggs." doc = nlp(test_text) - gold_sent_starts = [0] * 12 + gold_sent_starts = [0] * 14 gold_sent_starts[0] = 1 - gold_sent_starts[4] = 1 - gold_sent_starts[8] = 1 - assert gold_sent_starts == [int(t.is_sent_start) for t in doc] + gold_sent_starts[5] = 1 + gold_sent_starts[9] = 1 + assert [int(t.is_sent_start) for t in doc] == gold_sent_starts # Also test the results are still the same after IO with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) - assert gold_sent_starts == [int(t.is_sent_start) for t in doc2] + assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts diff --git a/spacy/util.py b/spacy/util.py index 216158e52..37649c5e6 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -79,11 +79,6 @@ def set_lang_class(name, cls): registry.languages.register(name, func=cls) -def make_layer(arch_config): - arch_func = registry.architectures.get(arch_config["arch"]) - return arch_func(arch_config["config"]) - - def ensure_path(path): """Ensure string is converted to a Path. @@ -563,7 +558,7 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len): """Create minibatches of a given number of words.""" if isinstance(size, int): size_ = itertools.repeat(size) - if isinstance(size, List): + elif isinstance(size, List): size_ = iter(size) else: size_ = size
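
Note (not part of the patch): below is a minimal sketch of how the new extract-embed-encode pieces compose in plain Python, mirroring the values in `multihashembed_tok2vec.cfg` above. It assumes the registered architecture functions can be imported directly from `spacy.ml.models.tok2vec` at this commit; in normal use the same wiring is produced by resolving the config through the architectures registry (as `Language.create_pipe` does via `registry.make_from_config`).

```python
# Sketch only: builds the same Tok2Vec model that the example config describes,
# by calling the newly registered architecture functions directly.
from spacy.ml.models.tok2vec import (
    Doc2Feats,              # extract: turns Docs into per-token feature arrays
    MultiHashEmbed,         # embed: hash-embeds the NORM/PREFIX/SUFFIX/SHAPE columns
    LayerNormalizedMaxout,  # mix: combines the concatenated embedding columns
    MaxoutWindowEncoder,    # encode: windowed Maxout CNN over the embedded tokens
    Tok2Vec,                # chains extract >> with_array(embed >> encode)
)

COLS = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]

extract = Doc2Feats(columns=COLS)
mix = LayerNormalizedMaxout(width=96, maxout_pieces=3)
embed = MultiHashEmbed(
    columns=COLS, width=96, rows=2000,
    use_subwords=True, pretrained_vectors=None, mix=mix,
)
encode = MaxoutWindowEncoder(width=96, window_size=1, maxout_pieces=3, depth=2)

tok2vec = Tok2Vec(extract, embed, encode)  # nO == 96, receptive_field == 2
```

The same three-part split is what lets `charembed_tok2vec.cfg` swap in `spacy.CharacterEmbed.v1` for the extract step without touching the embed or encode layers.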