From e1a83d15ed53a0bc9779182bdf1732cd6f722918 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 9 Mar 2019 11:50:08 +0000
Subject: [PATCH] Add support for character features to Tok2Vec

---
 spacy/_ml.py | 103 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 94 insertions(+), 9 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 68dedc0b3..85d80c3f1 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -15,7 +15,7 @@ from thinc.api import uniqued, wrap, noop
 from thinc.api import with_square_sequences
 from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
-from thinc.neural.util import get_array_module
+from thinc.neural.util import get_array_module, copy_array
 from thinc.neural.optimizers import Adam
 
 from thinc import describe
@@ -273,6 +273,9 @@ def Tok2Vec(width, embed_size, **kwargs):
     pretrained_vectors = kwargs.get("pretrained_vectors", None)
     cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
     subword_features = kwargs.get("subword_features", True)
+    char_embed = kwargs.get("char_embed", False)
+    if char_embed:
+        subword_features = False
     conv_depth = kwargs.get("conv_depth", 4)
     bilstm_depth = kwargs.get("bilstm_depth", 0)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
@@ -295,7 +298,7 @@ def Tok2Vec(width, embed_size, **kwargs):
         if pretrained_vectors is not None:
             glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
 
-            if subword_features:
+            if subword_features:
                 embed = uniqued(
                     (glove | norm | prefix | suffix | shape)
                     >> LN(Maxout(width, width * 5, pieces=3)),
@@ -310,8 +313,14 @@ def Tok2Vec(width, embed_size, **kwargs):
             embed = uniqued(
                 (norm | prefix | suffix | shape)
                 >> LN(Maxout(width, width * 4, pieces=3)),
-                column=cols.index(ORTH),
+                column=cols.index(ORTH)
             )
+        elif char_embed:
+            embed = concatenate_lists(
+                CharacterEmbed(nM=64, nC=8),
+                FeatureExtracter(cols) >> with_flatten(norm)
+            )
+            reduce_dimensions = LN(Maxout(width, 64*8+width, pieces=cnn_maxout_pieces))
         else:
             embed = norm
 
@@ -319,9 +328,23 @@ def Tok2Vec(width, embed_size, **kwargs):
             ExtractWindow(nW=1)
             >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
         )
-        tok2vec = FeatureExtracter(cols) >> with_flatten(
-            embed >> convolution ** conv_depth, pad=conv_depth
-        )
+        if char_embed:
+            tok2vec = (
+                embed
+                >> with_flatten(
+                    reduce_dimensions
+                    >> convolution ** conv_depth, pad=conv_depth
+                )
+            )
+        else:
+            tok2vec = (
+                FeatureExtracter(cols)
+                >> with_flatten(
+                    embed
+                    >> convolution ** conv_depth, pad=conv_depth
+                )
+            )
+
         if bilstm_depth >= 1:
             tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
@@ -537,7 +560,7 @@ def build_morphologizer_model(class_nums, **cfg):
     else:
         token_vector_width = util.env_opt("token_vector_width", 128)
     pretrained_vectors = cfg.get("pretrained_vectors")
-    subword_features = cfg.get("subword_features", True)
+    char_embed = cfg.get("char_embed", True)
     with Model.define_operators({">>": chain, "+": add}):
         if "tok2vec" in cfg:
             tok2vec = cfg["tok2vec"]
@@ -545,7 +568,7 @@ def build_morphologizer_model(class_nums, **cfg):
             tok2vec = Tok2Vec(
                 token_vector_width,
                 embed_size,
-                subword_features=subword_features,
+                char_embed=char_embed,
                 pretrained_vectors=pretrained_vectors,
             )
         softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
@@ -688,7 +711,8 @@ def concatenate_lists(*layers, **kwargs):  # pragma: no cover
     concat = concatenate(*layers)
 
     def concatenate_lists_fwd(Xs, drop=0.0):
-        drop *= drop_factor
+        if drop is not None:
+            drop *= drop_factor
         lengths = ops.asarray([len(X) for X in Xs], dtype="i")
         flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
         ys = ops.unflatten(flat_y, lengths)
@@ -776,3 +800,64 @@ def _replace_word(word, random_words, mask="[MASK]"):
         return random_words.next()
     else:
         return word
+
+
+def _uniform_init(lo, hi):
+    def wrapped(W, ops):
+        copy_array(W, ops.xp.random.uniform(lo, hi, W.shape))
+    return wrapped
+
+
+@describe.attributes(
+    nM=Dimension("Vector dimensions"),
+    nC=Dimension("Number of characters per word"),
+    vectors=Synapses("Embed matrix",
+        lambda obj: (obj.nC, obj.nV, obj.nM),
+        _uniform_init(-0.1, 0.1)),
+    d_vectors=Gradient("vectors")
+)
+class CharacterEmbed(Model):
+    def __init__(self, nM=None, nC=None, **kwargs):
+        Model.__init__(self, **kwargs)
+        self.nM = nM
+        self.nC = nC
+
+    @property
+    def nO(self):
+        return self.nM * self.nC
+
+    @property
+    def nV(self):
+        return 256
+
+    def begin_update(self, docs, drop=0.):
+        if not docs:
+            return []
+        ids = []
+        output = []
+        weights = self.vectors
+        # This assists in indexing; it's like looping over this dimension.
+        # Still consider this weird witch craft...But thanks to Mark Neumann
+        # for the tip.
+        nCv = self.ops.xp.arange(self.nC)
+        for doc in docs:
+            doc_ids = doc.to_utf8_array(nr_char=self.nC)
+            doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
+            # Let's say I have a 2d array of indices, and a 3d table of data. What numpy
+            # incantation do I chant to get
+            # output[i, j, k] == data[j, ids[i, j], k]?
+            doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]]
+            output.append(doc_vectors.reshape((len(doc), self.nO)))
+            ids.append(doc_ids)
+
+        def backprop_character_embed(d_vectors, sgd=None):
+            gradient = self.d_vectors
+            for doc_ids, d_doc_vectors in zip(ids, d_vectors):
+                d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM))
+                gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
+            if sgd is not None:
+                sgd(self._mem.weights, self._mem.gradient, key=self.id)
+            return None
+        return output, backprop_character_embed
+
+
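
Usage sketch (not part of the patch itself): how the new char_embed flag might be used when building a Tok2Vec model directly. The width and embed_size values below are illustrative assumptions; the only behaviour taken from the diff is that char_embed=True swaps the prefix/suffix/shape subword features for a CharacterEmbed layer concatenated with the NORM hash embedding, then reduces the result back to the requested width before the convolutions.

# A minimal sketch, assuming spaCy with this patch applied. The numeric
# arguments are example values, not defaults taken from the diff.
from spacy._ml import Tok2Vec

tok2vec = Tok2Vec(
    96,               # token vector width
    2000,             # rows in the NORM hash embedding table
    char_embed=True,  # new flag: embed UTF-8 bytes instead of subword features
    conv_depth=4,
    bilstm_depth=0,
)
# The resulting model maps a list of Doc objects to a list of arrays of
# shape (n_tokens, width).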
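
A related sketch: build_morphologizer_model now reads a char_embed setting (default True) instead of subword_features and forwards it to Tok2Vec. The class_nums and token_vector_width values below are hypothetical examples.

# A minimal sketch, assuming spaCy with this patch applied. class_nums is a
# hypothetical list giving the number of label values per morphological field.
from spacy._ml import build_morphologizer_model

class_nums = [20, 15, 8]
model = build_morphologizer_model(class_nums, token_vector_width=128)
# Passing char_embed=False here falls back to Tok2Vec's default subword
# features (prefix/suffix/shape), since subword_features is no longer set.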
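
The advanced-indexing line in CharacterEmbed.begin_update answers the question posed in its own comment: with an index array of shape (n_words, nC) and a table of shape (nC, nV, nM), table[arange(nC), ids[:, arange(nC)]] gathers output[i, j, k] == table[j, ids[i, j], k], and the backward pass scatters gradients through the same indices. A small self-contained check of that identity, using assumed example shapes:

# Standalone numpy check of the gather used in CharacterEmbed.begin_update.
# The shapes are assumptions for illustration: nW words, nC characters per
# word, nV possible byte values, nM dimensions per character vector.
import numpy

nW, nC, nV, nM = 5, 8, 256, 64
table = numpy.random.uniform(-0.1, 0.1, (nC, nV, nM))
ids = numpy.random.randint(0, nV, size=(nW, nC))

nCv = numpy.arange(nC)
# Broadcasting (nC,) against (nW, nC) selects table[j, ids[i, j]] for all i, j.
gathered = table[nCv, ids[:, nCv]]
assert gathered.shape == (nW, nC, nM)
for i in range(nW):
    for j in range(nC):
        assert numpy.array_equal(gathered[i, j], table[j, ids[i, j]])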