mirror of https://github.com/explosion/spaCy.git
Support subword_features and conv_depth params in Tok2Vec
parent 9c33d4d1df
commit 8051136d70

spacy/_ml.py | 36 +++++++++++++++++++++++++-----------
@@ -248,27 +248,39 @@ def link_vectors_to_models(vocab):
 def Tok2Vec(width, embed_size, **kwargs):
     pretrained_vectors = kwargs.get('pretrained_vectors', None)
     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
+    subword_features = kwargs.get('subword_features', True)
+    conv_depth = kwargs.get('conv_depth', 4)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
                                  '+': add, '*': reapply}):
         norm = HashEmbed(width, embed_size, column=cols.index(NORM),
                          name='embed_norm')
-        prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
-                           name='embed_prefix')
-        suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
-                           name='embed_suffix')
-        shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
-                          name='embed_shape')
+        if subword_features:
+            prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
+                               name='embed_prefix')
+            suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
+                               name='embed_suffix')
+            shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
+                              name='embed_shape')
+        else:
+            prefix, suffix, shape = (None, None, None)
         if pretrained_vectors is not None:
             glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
 
-            embed = uniqued(
-                (glove | norm | prefix | suffix | shape)
-                >> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
-        else:
+            if subword_features:
+                embed = uniqued(
+                    (glove | norm | prefix | suffix | shape)
+                    >> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
+            else:
+                embed = uniqued(
+                    (glove | norm)
+                    >> LN(Maxout(width, width*2, pieces=3)), column=cols.index(ORTH))
+        elif subword_features:
             embed = uniqued(
                 (norm | prefix | suffix | shape)
                 >> LN(Maxout(width, width*4, pieces=3)), column=cols.index(ORTH))
+        else:
+            embed = norm
 
         convolution = Residual(
             ExtractWindow(nW=1)
@@ -279,7 +291,7 @@ def Tok2Vec(width, embed_size, **kwargs):
         FeatureExtracter(cols)
         >> with_flatten(
             embed
-            >> convolution ** 4, pad=4
+            >> convolution ** conv_depth, pad=conv_depth
         )
     )
     # Work around thinc API limitations :(. TODO: Revise in Thinc 7
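In the operator block, ** is bound to thinc's clone, so convolution ** conv_depth stacks conv_depth copies of the residual CNN block; each ExtractWindow(nW=1) layer extends the receptive field by one token on either side, which is why the padding is raised from the hard-coded 4 to pad=conv_depth as well. A sketch of varying the depth (illustrative values, same assumptions as above):

    from spacy._ml import Tok2Vec

    shallow = Tok2Vec(96, 2000, conv_depth=2)  # 2 residual CNN layers, pad=2
    deep = Tok2Vec(96, 2000, conv_depth=8)     # 8 layers: wider context, more compute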
@@ -424,11 +436,13 @@ def build_tagger_model(nr_class, **cfg):
     else:
         token_vector_width = util.env_opt('token_vector_width', 128)
     pretrained_vectors = cfg.get('pretrained_vectors')
+    subword_features = cfg.get('subword_features', True)
     with Model.define_operators({'>>': chain, '+': add}):
         if 'tok2vec' in cfg:
             tok2vec = cfg['tok2vec']
         else:
             tok2vec = Tok2Vec(token_vector_width, embed_size,
+                              subword_features=subword_features,
                               pretrained_vectors=pretrained_vectors)
     softmax = with_flatten(Softmax(nr_class, token_vector_width))
     model = (
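build_tagger_model now reads subword_features from its cfg and forwards it to Tok2Vec, so a tagger can be built without the subword embedding tables. A hedged sketch, assuming the unshown branches of this function fall back to env_opt defaults for token_vector_width and embed_size; nr_class=17 is an illustrative tag count:

    from spacy._ml import build_tagger_model

    tagger = build_tagger_model(17, subword_features=False)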