mirror of https://github.com/explosion/spaCy.git
Pass tokvecs through as a list, instead of concatenated. Also fix padding
This commit is contained in:
parent
d52b65aec2
commit
3b7c108246
|
@ -134,13 +134,14 @@ def Tok2Vec(width, embed_size, preprocess=None):
|
||||||
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2)
|
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2)
|
||||||
|
|
||||||
tok2vec = (
|
tok2vec = (
|
||||||
flatten
|
with_flatten(
|
||||||
>> (lower | prefix | suffix | shape )
|
(lower | prefix | suffix | shape )
|
||||||
>> Maxout(width, width*4, pieces=3)
|
>> Maxout(width, width*4, pieces=3)
|
||||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
|
||||||
|
pad=4, ndim=5)
|
||||||
)
|
)
|
||||||
if preprocess not in (False, None):
|
if preprocess not in (False, None):
|
||||||
tok2vec = preprocess >> tok2vec
|
tok2vec = preprocess >> tok2vec
|
||||||
|
|
|
@ -179,10 +179,10 @@ class Language(object):
|
||||||
tok2vec = self.pipeline[0]
|
tok2vec = self.pipeline[0]
|
||||||
feats = tok2vec.doc2feats(docs)
|
feats = tok2vec.doc2feats(docs)
|
||||||
for proc in self.pipeline[1:]:
|
for proc in self.pipeline[1:]:
|
||||||
tokvecs, bp_tokvecs = tok2vec.model.begin_update(feats, drop=drop)
|
|
||||||
grads = {}
|
grads = {}
|
||||||
d_tokvecs = proc.update((docs, tokvecs), golds, sgd=get_grads, drop=drop)
|
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
||||||
bp_tokvecs(d_tokvecs, sgd=get_grads)
|
d_tokvecses = proc.update((docs, tokvecses), golds, sgd=get_grads, drop=drop)
|
||||||
|
bp_tokvecses(d_tokvecses, sgd=get_grads)
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
for key, (W, dW) in grads.items():
|
for key, (W, dW) in grads.items():
|
||||||
# TODO: Unhack this when thinc improves
|
# TODO: Unhack this when thinc improves
|
||||||
|
|
|
@ -10,7 +10,7 @@ cimport numpy as np
|
||||||
import cytoolz
|
import cytoolz
|
||||||
import util
|
import util
|
||||||
|
|
||||||
from thinc.api import add, layerize, chain, clone, concatenate
|
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||||
from thinc.neural import Model, Maxout, Softmax, Affine
|
from thinc.neural import Model, Maxout, Softmax, Affine
|
||||||
from thinc.neural._classes.hash_embed import HashEmbed
|
from thinc.neural._classes.hash_embed import HashEmbed
|
||||||
from thinc.neural.util import to_categorical
|
from thinc.neural.util import to_categorical
|
||||||
|
@ -52,16 +52,16 @@ class TokenVectorEncoder(object):
|
||||||
self.doc2feats = doc2feats()
|
self.doc2feats = doc2feats()
|
||||||
self.model = model
|
self.model = model
|
||||||
|
|
||||||
def __call__(self, docs, state=None):
|
def __call__(self, docs):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
tokvecs = self.predict(docs)
|
tokvecses = self.predict(docs)
|
||||||
self.set_annotations(docs, tokvecs)
|
self.set_annotations(docs, tokvecses)
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
for docs in cytoolz.partition_all(batch_size, stream):
|
for docs in cytoolz.partition_all(batch_size, stream):
|
||||||
tokvecs = self.predict(docs)
|
tokvecses = self.predict(docs)
|
||||||
self.set_annotations(docs, tokvecs)
|
self.set_annotations(docs, tokvecses)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
|
@ -69,11 +69,9 @@ class TokenVectorEncoder(object):
|
||||||
tokvecs = self.model(feats)
|
tokvecs = self.model(feats)
|
||||||
return tokvecs
|
return tokvecs
|
||||||
|
|
||||||
def set_annotations(self, docs, tokvecs):
|
def set_annotations(self, docs, tokvecses):
|
||||||
start = 0
|
for doc, tokvecs in zip(docs, tokvecses):
|
||||||
for doc in docs:
|
doc.tensor = tokvecs
|
||||||
doc.tensor = tokvecs[start : start + len(doc)]
|
|
||||||
start += len(doc)
|
|
||||||
|
|
||||||
def begin_update(self, docs, drop=0.):
|
def begin_update(self, docs, drop=0.):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
|
@ -136,7 +134,7 @@ class NeuralTagger(object):
|
||||||
docs, tokvecs = docs_tokvecs
|
docs, tokvecs = docs_tokvecs
|
||||||
|
|
||||||
if self.model.nI is None:
|
if self.model.nI is None:
|
||||||
self.model.nI = tokvecs.shape[1]
|
self.model.nI = tokvecs[0].shape[1]
|
||||||
|
|
||||||
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
|
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
|
||||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||||
|
@ -146,6 +144,7 @@ class NeuralTagger(object):
|
||||||
return d_tokvecs
|
return d_tokvecs
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
|
scores = self.model.ops.flatten(scores)
|
||||||
tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
|
tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
|
||||||
|
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
|
@ -161,7 +160,7 @@ class NeuralTagger(object):
|
||||||
correct = self.model.ops.xp.array(correct, dtype='i')
|
correct = self.model.ops.xp.array(correct, dtype='i')
|
||||||
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
||||||
loss = (d_scores**2).sum()
|
loss = (d_scores**2).sum()
|
||||||
d_scores = self.model.ops.asarray(d_scores, dtype='f')
|
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, gold_tuples, pipeline=None):
|
def begin_training(self, gold_tuples, pipeline=None):
|
||||||
|
@ -179,9 +178,8 @@ class NeuralTagger(object):
|
||||||
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
||||||
vocab.morphology.lemmatizer)
|
vocab.morphology.lemmatizer)
|
||||||
token_vector_width = pipeline[0].model.nO
|
token_vector_width = pipeline[0].model.nO
|
||||||
self.model = rebatch(1024, Softmax(self.vocab.morphology.n_tags,
|
self.model = with_flatten(
|
||||||
token_vector_width))
|
Softmax(self.vocab.morphology.n_tags, token_vector_width))
|
||||||
#self.model = Softmax(self.vocab.morphology.n_tags)
|
|
||||||
|
|
||||||
def use_params(self, params):
|
def use_params(self, params):
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
|
|
|
@ -311,7 +311,8 @@ cdef class Parser:
|
||||||
return states
|
return states
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None):
|
||||||
docs, tokvecs = docs_tokvecs
|
docs, tokvec_lists = docs_tokvecs
|
||||||
|
tokvecs = self.model[0].ops.flatten(tokvec_lists)
|
||||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
golds = [golds]
|
golds = [golds]
|
||||||
|
@ -324,7 +325,8 @@ cdef class Parser:
|
||||||
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
|
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
|
||||||
drop)
|
drop)
|
||||||
|
|
||||||
todo = [(s, g) for s, g in zip(states, golds) if not s.is_final()]
|
todo = [(s, g) for (s, g) in zip(states, golds)
|
||||||
|
if not s.is_final()]
|
||||||
|
|
||||||
backprops = []
|
backprops = []
|
||||||
cdef float loss = 0.
|
cdef float loss = 0.
|
||||||
|
@ -365,7 +367,7 @@ cdef class Parser:
|
||||||
else:
|
else:
|
||||||
xp.add.at(d_tokvecs,
|
xp.add.at(d_tokvecs,
|
||||||
token_ids, d_state_features * active_feats)
|
token_ids, d_state_features * active_feats)
|
||||||
return d_tokvecs
|
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
||||||
|
|
||||||
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
||||||
lower, upper = self.model
|
lower, upper = self.model
|
||||||
|
|
Loading…
Reference in New Issue