mirror of https://github.com/explosion/spaCy.git
Add vectors option to CharacterEmbed (#6069)
* Add vectors option to CharacterEmbed
* Update spacy/pipeline/morphologizer.pyx
* Adjust default morphologizer config

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent
d722a439aa
commit
f3db3f6fe0
|
@ -164,7 +164,7 @@ def MultiHashEmbed(
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
||||||
def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
|
||||||
"""Construct an embedded representation based on character embeddings, using
|
"""Construct an embedded representation based on character embeddings, using
|
||||||
a feed-forward network. A fixed number of UTF-8 byte characters are used for
|
a feed-forward network. A fixed number of UTF-8 byte characters are used for
|
||||||
each word, taken from the beginning and end of the word equally. Padding is
|
each word, taken from the beginning and end of the word equally. Padding is
|
||||||
|
@ -188,18 +188,35 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
||||||
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
|
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
|
||||||
are between 3 and 8, although it may depend on the length of words in the
|
are between 3 and 8, although it may depend on the length of words in the
|
||||||
language.
|
language.
|
||||||
|
also_use_static_vectors (bool): Whether to also use static word vectors.
|
||||||
|
Requires a vectors table to be loaded in the Doc objects' vocab.
|
||||||
"""
|
"""
|
||||||
model = chain(
|
if also_use_static_vectors:
|
||||||
concatenate(
|
model = chain(
|
||||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
concatenate(
|
||||||
chain(
|
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||||
FeatureExtractor([NORM]),
|
chain(
|
||||||
list2ragged(),
|
FeatureExtractor([NORM]),
|
||||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
list2ragged(),
|
||||||
|
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||||
|
),
|
||||||
|
StaticVectors(width, dropout=0.0),
|
||||||
),
|
),
|
||||||
),
|
with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
|
||||||
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
|
ragged2list(),
|
||||||
ragged2list(),
|
)
|
||||||
|
else:
|
||||||
|
model = chain(
|
||||||
|
concatenate(
|
||||||
|
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||||
|
chain(
|
||||||
|
FeatureExtractor([NORM]),
|
||||||
|
list2ragged(),
|
||||||
|
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||||
|
),
|
||||||
|
),
|
||||||
|
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
|
||||||
|
ragged2list(),
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
|
@ -32,6 +32,7 @@ width = 128
|
||||||
rows = 7000
|
rows = 7000
|
||||||
nM = 64
|
nM = 64
|
||||||
nC = 8
|
nC = 8
|
||||||
|
also_use_static_vectors = false
|
||||||
|
|
||||||
[model.tok2vec.encode]
|
[model.tok2vec.encode]
|
||||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
|
|
@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
|
||||||
[
|
[
|
||||||
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
|
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
|
||||||
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
|
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
|
||||||
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
|
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
|
||||||
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
|
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
Loading…
Reference in New Issue