2020-07-27 22:52:50 +00:00
|
|
|
from typing import Optional, List

from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop
from thinc.api import FeatureExtractor, HashEmbed, StaticVectors
from thinc.api import expand_window, residual, Maxout, Mish
from thinc.api import LayerNorm, PyTorchLSTM
from thinc.types import Floats2d

from ...tokens import Doc
from ... import util
from ...util import registry
from ...ml import _character_embed
from ...pipeline.tok2vec import Tok2VecListener
from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
|
|
|
|
|
|
|
|
|
2020-07-28 11:51:43 +00:00
|
|
|
@registry.architectures.register("spacy.Tok2VecListener.v1")
def tok2vec_listener_v1(width, upstream="*"):
    """Create a Tok2VecListener proxy layer.

    The listener connects a downstream component to a shared upstream
    tok2vec component identified by `upstream` ("*" matches any).
    """
    return Tok2VecListener(upstream_name=upstream, width=width)
|
|
|
|
|
|
|
|
|
|
|
|
@registry.architectures.register("spacy.Tok2Vec.v1")
def Tok2Vec(
    embed: Model[List[Doc], List[Floats2d]],
    encode: Model[List[Floats2d], List[Floats2d]],
) -> Model[List[Doc], List[Floats2d]]:
    """Compose an embedding layer and a contextual-encoding layer into a
    full tok2vec model.

    embed: converts a batch of Docs into per-token feature arrays.
    encode: transforms those arrays in context (e.g. a window CNN).

    Returns a model mapping List[Doc] -> List[Floats2d], with "embed" and
    "encode" exposed as named refs and "nO" taken from the encoder.
    """
    # `embed` consumes Docs, so it must stay outside with_array: only the
    # encoder operates array-to-array. `pad` gives the encoder extra context
    # rows matching its receptive field (0 when the encoder declares none).
    tok2vec = chain(
        embed,
        with_array(encode, pad=encode.attrs.get("receptive_field", 0)),
    )
    tok2vec.set_dim("nO", encode.get_dim("nO"))
    tok2vec.set_ref("embed", embed)
    tok2vec.set_ref("encode", encode)
    return tok2vec
|
|
|
|
|
|
|
|
|
2020-07-28 11:51:43 +00:00
|
|
|
@registry.architectures.register("spacy.HashEmbed.v1")
def HashEmbed(
    width: int,
    rows: int,
    also_embed_subwords: bool,
    also_use_static_vectors: bool,
):
    """Build a multi-feature hash-embedding layer.

    width: output width of each embedding table and of the final Maxout.
    rows: number of rows for the NORM table; subword tables get rows // 2.
    also_embed_subwords: if True, also embed PREFIX, SUFFIX and SHAPE.
    also_use_static_vectors: if True, concatenate pretrained static vectors
        before the mixing Maxout.
    """
    # This factory shadows thinc.api.HashEmbed at module level, so re-import
    # the layer under an alias; calling the bare name here would recurse
    # into this function with the wrong arguments.
    from thinc.api import HashEmbed as hash_embed

    cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    seed = 7

    def make_hash_embed(feature):
        # Each table gets a distinct seed so the hash functions differ.
        nonlocal seed
        seed += 1
        return hash_embed(
            width,
            rows if feature == NORM else rows // 2,
            column=cols.index(feature),
            seed=seed,
            dropout=0.0,
        )

    if also_embed_subwords:
        embeddings = [
            make_hash_embed(NORM),
            make_hash_embed(PREFIX),
            make_hash_embed(SUFFIX),
            make_hash_embed(SHAPE),
        ]
    else:
        embeddings = [make_hash_embed(NORM)]

    if also_use_static_vectors:
        model = chain(
            concatenate(
                chain(FeatureExtractor(cols), concatenate(*embeddings)),
                StaticVectors(width, dropout=0.0),
            ),
            # thinc's Maxout takes nP (number of pieces), not `pieces`;
            # see MaxoutWindowEncoder for the same usage.
            Maxout(width, nP=3, dropout=0.0, normalize=True),
        )
    else:
        model = chain(
            chain(FeatureExtractor(cols), concatenate(*embeddings)),
            Maxout(width, nP=3, dropout=0.0, normalize=True),
        )
    return model
|
|
|
|
|
2020-02-27 17:42:27 +00:00
|
|
|
|
2020-03-08 12:23:18 +00:00
|
|
|
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
    """Concatenate a character-based embedding with a hashed NORM embedding.

    columns: sequence of attribute names; must contain "NORM".
    width: output width of the NORM hash table.
    rows: number of rows in the NORM hash table.
    nM: character embedding width; nC: number of characters per word.
    features: a feature-extraction model producing the array the NORM
        embedding reads from.
    dropout: dropout rate for the NORM embedding.
    """
    # The module-level HashEmbed factory shadows thinc.api.HashEmbed, so
    # import the actual thinc layer under an alias — the keyword arguments
    # here (nO/nV) belong to the thinc layer, not the factory.
    from thinc.api import HashEmbed as hash_embed

    norm = hash_embed(
        nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=5
    )
    chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
    with Model.define_operators({">>": chain, "|": concatenate}):
        embed_layer = chr_embed | features >> with_array(norm)
    # nM * nC character dims plus the NORM embedding width.
    embed_layer.set_dim("nO", nM * nC + width)
    return embed_layer
|
2020-02-27 17:42:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: int):
    """Stack of residual CNN blocks: each block widens every token's row with
    its window neighbours, then mixes back to `width` via a normalized Maxout.

    The model advertises its receptive field (window_size * depth) via
    attrs["receptive_field"] so callers can pad accordingly.
    """
    window_width = (window_size * 2) + 1  # token itself plus neighbours
    mix = Maxout(
        nO=width,
        nI=width * window_width,
        nP=maxout_pieces,
        dropout=0.0,
        normalize=True,
    )
    block = chain(expand_window(window_size=window_size), mix)
    model = clone(residual(block), depth)
    model.set_dim("nO", width)
    model.attrs["receptive_field"] = window_size * depth
    return model
|
|
|
|
|
|
|
|
|
|
|
|
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(width, window_size, depth):
    """Stack of residual CNN blocks using a Mish activation with layer norm
    instead of Maxout. Output width is fixed to `width`.
    """
    window_width = (window_size * 2) + 1  # token itself plus neighbours
    block = chain(
        expand_window(window_size=window_size),
        Mish(nO=width, nI=width * window_width),
        LayerNorm(width),
    )
    model = clone(residual(block), depth)
    model.set_dim("nO", width)
    return model
|
|
|
|
|
|
|
|
|
|
|
|
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def BiLSTMEncoder(width, depth, dropout):
    """Contextual encoder backed by a PyTorch bidirectional LSTM.

    With depth == 0 the encoder is the identity (noop). Otherwise the LSTM
    is wrapped in with_padded so it accepts the list-of-arrays interface.
    """
    if depth == 0:
        return noop()
    lstm = PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout)
    return with_padded(lstm)
|