From 6e641f46d49fcdc88e3d0cbcefa5c4860e2cd0ea Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 12 Nov 2017 00:43:41 +0100
Subject: [PATCH] Create a preprocess function that gets bigrams

---
 spacy/_ml.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 9811ce36a..e5d1cfc63 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -92,14 +92,29 @@ def _zero_init(model):
 
 @layerize
 def _preprocess_doc(docs, drop=0.):
-    keys = [doc.to_array([LOWER]) for doc in docs]
+    keys = [doc.to_array(LOWER) for doc in docs]
     ops = Model.ops
     # The dtype here matches what thinc is expecting -- which differs per
     # platform (by int definition). This should be fixed once the problem
     # is fixed on Thinc's side.
     lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
     keys = ops.xp.concatenate(keys)
-    vals = ops.allocate(keys.shape[0]) + 1
+    vals = ops.allocate(keys.shape) + 1.
     return (keys, vals, lengths), None
 
+@layerize
+def _preprocess_doc_bigrams(docs, drop=0.):
+    unigrams = [doc.to_array(LOWER) for doc in docs]
+    ops = Model.ops
+    bigrams = [ops.ngrams(2, doc_unis) for doc_unis in unigrams]
+    keys = [ops.xp.concatenate(feats) for feats in zip(unigrams, bigrams)]
+    keys, vals = zip(*[ops.xp.unique(k, return_counts=True) for k in keys])
+    # The dtype here matches what thinc is expecting -- which differs per
+    # platform (by int definition). This should be fixed once the problem
+    # is fixed on Thinc's side.
+    lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
+    keys = ops.xp.concatenate(keys)
+    vals = ops.asarray(ops.xp.concatenate(vals), dtype='f')
+    return (keys, vals, lengths), None
+
@@ -514,8 +529,9 @@ def build_text_classifier(nr_class, width=64, **cfg):
 
     linear_model = (
         _preprocess_doc
-        >> LinearModel(nr_class, drop_factor=0.)
+        >> LinearModel(nr_class)
    )
+    #model = linear_model >> logistic
     model = (
         (linear_model | cnn_model)
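
Note for review: below is a minimal numpy-only sketch of what the new
_preprocess_doc_bigrams layer computes. The ngrams() helper is a
hypothetical stand-in for Thinc's ops.ngrams (its hash mixing is
illustrative only, not necessarily Thinc's actual scheme), and
preprocess_bigrams() mirrors the patched function minus the Doc and
Model.ops machinery.

import numpy

def ngrams(n, keys):
    # Stand-in for Thinc's ops.ngrams: hash each window of n adjacent
    # token IDs into one uint64 key. FNV-style mixing, illustrative only.
    keys = keys.astype(numpy.uint64)
    if keys.shape[0] < n:
        return numpy.zeros((0,), dtype=numpy.uint64)
    hashes = keys[:keys.shape[0] - n + 1].copy()
    for i in range(1, n):
        hashes = hashes * numpy.uint64(0x100000001b3)
        hashes ^= keys[i:keys.shape[0] - n + 1 + i]
    return hashes

def preprocess_bigrams(docs_as_ids):
    # docs_as_ids: one 1d array of LOWER IDs per doc, i.e. what
    # doc.to_array(LOWER) returns in the patch above.
    unigrams = [ids.astype(numpy.uint64) for ids in docs_as_ids]
    bigrams = [ngrams(2, ids) for ids in unigrams]
    feats = [numpy.concatenate(pair) for pair in zip(unigrams, bigrams)]
    # Collapse repeated features into (key, count) pairs per doc.
    keys, vals = zip(*[numpy.unique(f, return_counts=True) for f in feats])
    lengths = numpy.asarray([k.shape[0] for k in keys], dtype=numpy.int_)
    keys = numpy.concatenate(keys)
    vals = numpy.asarray(numpy.concatenate(vals), dtype='f')
    return keys, vals, lengths

docs_as_ids = [numpy.array([5, 9, 5, 9]), numpy.array([7])]
keys, vals, lengths = preprocess_bigrams(docs_as_ids)
# lengths -> [4, 1]: four unique features for the first doc (two unigram
# IDs plus two distinct bigram hashes), one for the second.

Unlike _preprocess_doc, which emits one key per token with a constant
value of 1., the bigram variant collapses duplicates via
unique(..., return_counts=True), so vals carries per-document feature
counts. For the bag-of-features LinearModel that is the same information
packed into shorter key arrays.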