From 8051136d7025dac6252f2114d0a850f4e0df6bf1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 27 Aug 2018 01:50:48 +0200
Subject: [PATCH] Support subword_features and conv_depth params in Tok2Vec

---
 spacy/_ml.py | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 609b09a1a..3d4e083b4 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -248,27 +248,39 @@ def link_vectors_to_models(vocab):
 def Tok2Vec(width, embed_size, **kwargs):
     pretrained_vectors = kwargs.get('pretrained_vectors', None)
     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
+    subword_features = kwargs.get('subword_features', True)
+    conv_depth = kwargs.get('conv_depth', 4)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
                                  '+': add, '*': reapply}):
         norm = HashEmbed(width, embed_size, column=cols.index(NORM),
                          name='embed_norm')
-        prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
-                           name='embed_prefix')
-        suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
-                           name='embed_suffix')
-        shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
-                          name='embed_shape')
+        if subword_features:
+            prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
+                               name='embed_prefix')
+            suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
+                               name='embed_suffix')
+            shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
+                              name='embed_shape')
+        else:
+            prefix, suffix, shape = (None, None, None)
         if pretrained_vectors is not None:
             glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
 
-            embed = uniqued(
-                (glove | norm | prefix | suffix | shape)
-                >> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
-        else:
+            if subword_features:
+                embed = uniqued(
+                    (glove | norm | prefix | suffix | shape)
+                    >> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
+            else:
+                embed = uniqued(
+                    (glove | norm)
+                    >> LN(Maxout(width, width*2, pieces=3)), column=cols.index(ORTH))
+        elif subword_features:
             embed = uniqued(
                 (norm | prefix | suffix | shape)
                 >> LN(Maxout(width, width*4, pieces=3)), column=cols.index(ORTH))
+        else:
+            embed = norm
 
         convolution = Residual(
             ExtractWindow(nW=1)
@@ -279,7 +291,7 @@ def Tok2Vec(width, embed_size, **kwargs):
             FeatureExtracter(cols)
             >> with_flatten(
                 embed
-                >> convolution ** 4, pad=4
+                >> convolution ** conv_depth, pad=conv_depth
             )
         )
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
@@ -424,11 +436,13 @@ def build_tagger_model(nr_class, **cfg):
     else:
         token_vector_width = util.env_opt('token_vector_width', 128)
     pretrained_vectors = cfg.get('pretrained_vectors')
+    subword_features = cfg.get('subword_features', True)
     with Model.define_operators({'>>': chain, '+': add}):
         if 'tok2vec' in cfg:
             tok2vec = cfg['tok2vec']
         else:
             tok2vec = Tok2Vec(token_vector_width, embed_size,
+                              subword_features=subword_features,
                               pretrained_vectors=pretrained_vectors)
         softmax = with_flatten(Softmax(nr_class, token_vector_width))
         model = (
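
For reference, a minimal sketch of how the new keyword arguments might be
exercised. The import path is spaCy-internal, and the width/embed_size
values below are illustrative, not part of the patch:

    from spacy._ml import Tok2Vec

    # Default behaviour is unchanged: subword_features=True, conv_depth=4.
    tok2vec = Tok2Vec(96, 2000)

    # Skip the prefix/suffix/shape HashEmbed tables and use a shallower
    # convolutional stack than the default of 4 layers.
    small_tok2vec = Tok2Vec(96, 2000, subword_features=False, conv_depth=2)

With subword_features=False and no pretrained_vectors, the embedding layer
reduces to the single NORM-column HashEmbed, which gives a smaller and
faster model at some cost in accuracy on unseen word forms.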