From a438cef8c575dc2cdc972591f0a5b0a50ce31f09 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 18 May 2017 06:21:32 -0500
Subject: [PATCH] Fix significant bug in feature calculation -- off by 1

---
 spacy/_ml.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index a00883217..d1dc64376 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -121,7 +121,7 @@ class PrecomputableMaxouts(Model):
         return Yfp, backward
 
 def Tok2Vec(width, embed_size, preprocess=None):
-    cols = [LOWER, PREFIX, SUFFIX, SHAPE]
+    cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
         lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size)
         prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2)
@@ -150,7 +150,7 @@ def get_col(idx):
             ops = NumpyOps()
         else:
             ops = CupyOps()
-        output = ops.xp.ascontiguousarray(X[:, idx])
+        output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
         def backward(y, sgd=None):
             dX = ops.allocate(X.shape)
             dX[:, idx] += y
@@ -173,9 +173,10 @@ def doc2feats(cols=None):
         for doc in docs:
             if 'cached_feats' not in doc.user_data:
                 doc.user_data['cached_feats'] = model.ops.asarray(
-                    doc.to_array(cols),
-                    dtype='uint64')
+                                                    doc.to_array(cols),
+                                                    dtype='uint64')
             feats.append(doc.user_data['cached_feats'])
+            assert feats[-1].dtype == 'uint64'
         return feats, None
     model = layerize(forward)
     model.cols = cols