From 8fdb9bc278b950436d6a6d28cb27ef093fca9560 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Nov 2018 22:17:16 +0100 Subject: [PATCH 1/8] =?UTF-8?q?=F0=9F=92=AB=20Add=20experimental=20ULMFit/?= =?UTF-8?q?BERT/Elmo-like=20pretraining=20=20(#2931)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add 'spacy pretrain' command * Fix pretrain command for Python 2 * Fix pretrain command * Fix pretrain command --- spacy/__main__.py | 3 +- spacy/cli/__init__.py | 1 + spacy/cli/pretrain.py | 188 ++++++++++++++++++++++++++++++++++++++++++ spacy/cli/train.py | 21 ++++- 4 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 spacy/cli/pretrain.py diff --git a/spacy/__main__.py b/spacy/__main__.py index 897d890c2..5d712ea15 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -6,7 +6,7 @@ from __future__ import print_function if __name__ == '__main__': import plac import sys - from spacy.cli import download, link, info, package, train, convert + from spacy.cli import download, link, info, package, train, pretrain, convert from spacy.cli import vocab, init_model, profile, evaluate, validate from spacy.cli import ud_train, ud_evaluate from spacy.util import prints @@ -16,6 +16,7 @@ if __name__ == '__main__': 'link': link, 'info': info, 'train': train, + 'pretrain': pretrain, 'ud-train': ud_train, 'evaluate': evaluate, 'ud-evaluate': ud_evaluate, diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 2788ffc86..5497c55ce 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,6 +4,7 @@ from .link import link from .package import package from .profile import profile from .train import train +from .pretrain import pretrain from .evaluate import evaluate from .convert import convert from .vocab import make_vocab as vocab diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py new file mode 100644 index 000000000..e356c1183 --- /dev/null +++ b/spacy/cli/pretrain.py @@ -0,0 +1,188 @@ +'''This script is experimental. + +Try pre-training the CNN component of the text categorizer using a cheap +language modelling-like objective. Specifically, we load pre-trained vectors +(from something like word2vec, GloVe, FastText etc), and use the CNN to +predict the tokens' pre-trained vectors. This isn't as easy as it sounds: +we're not merely doing compression here, because heavy dropout is applied, +including over the input words. This means the model must often (50% of the time) +use the context in order to predict the word. + +To evaluate the technique, we're pre-training with the 50k texts from the IMDB +corpus, and then training with only 100 labels. Note that it's a bit dirty to +pre-train with the development data, but also not *so* terrible: we're not using +the development labels, after all --- only the unlabelled text. +''' +from __future__ import print_function, unicode_literals +import plac +import random +import numpy +import time +import ujson as json +from pathlib import Path + +import spacy +from spacy.attrs import ID +from spacy.util import minibatch, use_gpu, compounding, ensure_path +from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer +from thinc.v2v import Affine + + +def prefer_gpu(): + used = spacy.util.use_gpu(0) + if used is None: + return False + else: + import cupy.random + cupy.random.seed(0) + return True + + +def load_texts(path): + '''Load inputs from a jsonl file. 
+
+    Each line should be a dict like {"text": "..."}
+    '''
+    path = ensure_path(path)
+    with path.open('r', encoding='utf8') as file_:
+        for line in file_:
+            data = json.loads(line)
+            yield data['text']
+
+
+def make_update(model, docs, optimizer, drop=0.):
+    """Perform an update over a single batch of documents.
+
+    docs (iterable): A batch of `Doc` objects.
+    drop (float): The dropout rate.
+    optimizer (callable): An optimizer.
+    RETURNS loss: A float for the loss.
+    """
+    predictions, backprop = model.begin_update(docs, drop=drop)
+    loss, gradients = get_vectors_loss(model.ops, docs, predictions)
+    backprop(gradients, sgd=optimizer)
+    return loss
+
+
+def get_vectors_loss(ops, docs, prediction):
+    """Compute a mean-squared error loss between the documents' vectors and
+    the prediction.
+
+    Note that this is ripe for customization! We could compute the vectors
+    in some other way, e.g. with an LSTM language model, or use some other
+    type of objective.
+    """
+    # The simplest way to implement this would be to vstack the
+    # token.vector values, but that's a bit inefficient, especially on GPU.
+    # Instead we fetch the index into the vectors table for each of our tokens,
+    # and look them up all at once. This prevents data copying.
+    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+    target = docs[0].vocab.vectors.data[ids]
+    d_scores = (prediction - target) / prediction.shape[0]
+    loss = (d_scores**2).sum()
+    return loss, d_scores
+
+
+def create_pretraining_model(nlp, tok2vec):
+    '''Define a network for the pretraining. We simply add an output layer onto
+    the tok2vec input model. The tok2vec input model needs to be a model that
+    takes a batch of Doc objects (as a list), and returns a list of arrays.
+    Each array in the output needs to have one row per token in the doc.
+    '''
+    output_size = nlp.vocab.vectors.data.shape[1]
+    output_layer = zero_init(Affine(output_size, drop_factor=0.0))
+    model = chain(
+        tok2vec,
+        flatten,
+        output_layer
+    )
+    model.output_layer = output_layer
+    model.begin_training([nlp.make_doc('Give it a doc to infer shapes')])
+    return model
+
+
+class ProgressTracker(object):
+    def __init__(self, frequency=10000):
+        self.loss = 0.
+ self.nr_word = 0 + self.frequency = frequency + self.last_time = time.time() + self.last_update = 0 + + def update(self, epoch, loss, docs): + self.loss += loss + self.nr_word += sum(len(doc) for doc in docs) + words_since_update = self.nr_word - self.last_update + if words_since_update >= self.frequency: + wps = words_since_update / (time.time() - self.last_time) + self.last_update = self.nr_word + self.last_time = time.time() + status = (epoch, self.nr_word, '%.5f' % self.loss, int(wps)) + return status + else: + return None + + +@plac.annotations( + texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str), + vectors_model=("Name or path to vectors model to learn from"), + output_dir=("Directory to write models each epoch", "positional", None, str), + width=("Width of CNN layers", "option", "cw", int), + depth=("Depth of CNN layers", "option", "cd", int), + embed_rows=("Embedding rows", "option", "er", int), + dropout=("Dropout", "option", "d", float), + seed=("Seed for random number generators", "option", "s", float), + nr_iter=("Number of iterations to pretrain", "option", "i", int), +) +def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4, + embed_rows=1000, dropout=0.2, nr_iter=1, seed=0): + """ + Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, + using an approximate language-modelling objective. Specifically, we load + pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict + vectors which match the pre-trained ones. The weights are saved to a directory + after each epoch. You can then pass a path to one of these pre-trained weights + files to the 'spacy train' command. + + This technique may be especially helpful if you have little labelled data. + However, it's still quite experimental, so your mileage may vary. + + To load the weights back in during 'spacy train', you need to ensure + all settings are the same between pretraining and training. The API and + errors around this need some improvement. + """ + config = dict(locals()) + output_dir = ensure_path(output_dir) + random.seed(seed) + numpy.random.seed(seed) + if not output_dir.exists(): + output_dir.mkdir() + with (output_dir / 'config.json').open('w') as file_: + file_.write(json.dumps(config)) + has_gpu = prefer_gpu() + nlp = spacy.load(vectors_model) + tok2vec = Tok2Vec(width, embed_rows, + conv_depth=depth, + pretrained_vectors=nlp.vocab.vectors.name, + bilstm_depth=0, # Requires PyTorch. Experimental. + cnn_maxout_pieces=2, # You can try setting this higher + subword_features=True) # Set to False for character models, e.g. 
Chinese + model = create_pretraining_model(nlp, tok2vec) + optimizer = create_default_optimizer(model.ops) + tracker = ProgressTracker() + texts = list(load_texts(texts_loc)) + print('Epoch', '#Words', 'Loss', 'w/s') + for epoch in range(nr_iter): + random.shuffle(texts) + for batch in minibatch(texts): + docs = [nlp.make_doc(text) for text in batch] + loss = make_update(model, docs, optimizer, drop=dropout) + progress = tracker.update(epoch, loss, docs) + if progress: + print(*progress) + with model.use_params(optimizer.averages): + with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_: + file_.write(tok2vec.to_bytes()) + with (output_dir / 'log.jsonl').open('a') as file_: + file_.write(json.dumps({'nr_word': tracker.nr_word, + 'loss': tracker.loss, 'epoch': epoch})) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index a0fb4d28a..ccd404db3 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -40,9 +40,11 @@ from ..compat import json_dumps version=("Model version", "option", "V", str), meta_path=("Optional path to meta.json. All relevant properties will be " "overwritten.", "option", "m", Path), + init_tok2vec=("Path to pretrained weights for the token-to-vector parts " + "of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path), verbose=("Display more information for debug", "option", None, bool)) def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, - parser_multitasks='', entity_multitasks='', + parser_multitasks='', entity_multitasks='', init_tok2vec=None, use_gpu=-1, vectors=None, no_tagger=False, noise_level=0.0, no_parser=False, no_entities=False, gold_preproc=False, version="0.0.0", meta_path=None, verbose=False): @@ -120,6 +122,9 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, for objective in entity_multitasks.split(','): nlp.entity.add_multitask_objective(objective) optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) + if init_tok2vec is not None: + loaded = _load_pretrained_tok2vec(nlp, init_tok2vec) + print("Loaded pretrained tok2vec for:", loaded) nlp._optimizer = None print("Itn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS") @@ -199,6 +204,20 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, _collate_best_model(meta, output_path, components) +def _load_pretrained_tok2vec(nlp, loc): + """Load pre-trained weights for the 'token-to-vector' part of the component + models, which is typically a CNN. See 'spacy pretrain'. Experimental. 
+ """ + with loc.open('rb') as file_: + weights_data = file_.read() + loaded = [] + for name, component in nlp.pipeline: + if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'): + component.model.tok2vec.from_bytes(weights_data) + loaded.append(name) + return loaded + + def _collate_best_model(meta, output_path, components): bests = {} for component in components: From 3e7b214e571e0b9b7d41d93f5a0583007b22b831 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Nov 2018 22:44:07 +0000 Subject: [PATCH 2/8] Make pretrain script work with stream from stdin --- spacy/cli/pretrain.py | 29 ++++++++++++++++++++--------- spacy/util.py | 12 +++++++++--- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index e356c1183..0c849d814 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -20,10 +20,11 @@ import numpy import time import ujson as json from pathlib import Path +import sys import spacy from spacy.attrs import ID -from spacy.util import minibatch, use_gpu, compounding, ensure_path +from spacy.util import minibatch_by_words, use_gpu, compounding, ensure_path from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer from thinc.v2v import Affine @@ -45,9 +46,13 @@ def load_texts(path): ''' path = ensure_path(path) with path.open('r', encoding='utf8') as file_: - for line in file_: - data = json.loads(line) - yield data['text'] + texts = [json.loads(line)['text'] for line in file_] + random.shuffle(texts) + return texts + +def stream_texts(): + for line in sys.stdin: + yield json.loads(line)['text'] def make_update(model, docs, optimizer, drop=0.): @@ -102,16 +107,19 @@ def create_pretraining_model(nlp, tok2vec): class ProgressTracker(object): - def __init__(self, frequency=10000): + def __init__(self, frequency=100000): self.loss = 0. 
self.nr_word = 0 + self.words_per_epoch = Counter() self.frequency = frequency self.last_time = time.time() self.last_update = 0 def update(self, epoch, loss, docs): self.loss += loss - self.nr_word += sum(len(doc) for doc in docs) + words_in_batch = sum(len(doc) for doc in docs) + self.words_per_epoch[epoch] += words_in_batch + self.nr_word += words_in_batch words_since_update = self.nr_word - self.last_update if words_since_update >= self.frequency: wps = words_since_update / (time.time() - self.last_time) @@ -170,19 +178,22 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4, model = create_pretraining_model(nlp, tok2vec) optimizer = create_default_optimizer(model.ops) tracker = ProgressTracker() - texts = list(load_texts(texts_loc)) print('Epoch', '#Words', 'Loss', 'w/s') + texts = stream_texts() if text_loc == '-' else load_texts(texts_loc) for epoch in range(nr_iter): - random.shuffle(texts) - for batch in minibatch(texts): + for batch in minibatch_by_words(texts, tuples=False, size=50000): docs = [nlp.make_doc(text) for text in batch] loss = make_update(model, docs, optimizer, drop=dropout) progress = tracker.update(epoch, loss, docs) if progress: print(*progress) + if texts_loc == '-' and progress.words_per_epoch[epoch] >= 10**7: + break with model.use_params(optimizer.averages): with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_: file_.write(tok2vec.to_bytes()) with (output_dir / 'log.jsonl').open('a') as file_: file_.write(json.dumps({'nr_word': tracker.nr_word, 'loss': tracker.loss, 'epoch': epoch})) + if texts_loc != '-': + texts = load_texts(texts_loc) diff --git a/spacy/util.py b/spacy/util.py index e32f51add..d0d112c91 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -465,7 +465,7 @@ def decaying(start, stop, decay): nr_upd += 1 -def minibatch_by_words(items, size, count_words=len): +def minibatch_by_words(items, size, tuples=True, count_words=len): '''Create minibatches of a given number of words.''' if isinstance(size, int): size_ = itertools.repeat(size) @@ -477,13 +477,19 @@ def minibatch_by_words(items, size, count_words=len): batch = [] while batch_size >= 0: try: - doc, gold = next(items) + if tuples: + doc, gold = next(items) + else: + doc = next(items) except StopIteration: if batch: yield batch return batch_size -= count_words(doc) - batch.append((doc, gold)) + if tuples: + batch.append((doc, gold)) + else: + batch.append(doc) if batch: yield batch From 6af6950e46570a78da4e75ea2871bcf1dcf199d9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Nov 2018 22:45:36 +0000 Subject: [PATCH 3/8] Fix pretrain --- spacy/cli/pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 0c849d814..fea8db8fd 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -21,6 +21,7 @@ import time import ujson as json from pathlib import Path import sys +from collections import Counter import spacy from spacy.attrs import ID @@ -179,7 +180,7 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4, optimizer = create_default_optimizer(model.ops) tracker = ProgressTracker() print('Epoch', '#Words', 'Loss', 'w/s') - texts = stream_texts() if text_loc == '-' else load_texts(texts_loc) + texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc) for epoch in range(nr_iter): for batch in minibatch_by_words(texts, tuples=False, size=50000): docs = [nlp.make_doc(text) for text in batch] From f8afaa0c1c7de7b57de7af8efe29fc15b8e74872 Mon Sep 17 
00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Nov 2018 22:46:53 +0000 Subject: [PATCH 4/8] Fix pretrain --- spacy/cli/pretrain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index fea8db8fd..cb5a524f3 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -188,8 +188,8 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4, progress = tracker.update(epoch, loss, docs) if progress: print(*progress) - if texts_loc == '-' and progress.words_per_epoch[epoch] >= 10**7: - break + if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7: + break with model.use_params(optimizer.averages): with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_: file_.write(tok2vec.to_bytes()) From 09a02276566b1637f3dfefa05be5dd50d9628902 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Nov 2018 23:18:35 +0000 Subject: [PATCH 5/8] Temporarily add a script to load reddit --- bin/load_reddit.py | 87 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 bin/load_reddit.py diff --git a/bin/load_reddit.py b/bin/load_reddit.py new file mode 100644 index 000000000..1b534bcc7 --- /dev/null +++ b/bin/load_reddit.py @@ -0,0 +1,87 @@ +# coding: utf8 +from __future__ import unicode_literals + +import bz2 +import regex as re +import ujson +import sys +import random +import datetime +import plac +from pathlib import Path + +_unset = object() + + +class Reddit(object): + """Stream cleaned comments from Reddit.""" + pre_format_re = re.compile(r'^[\`\*\~]') + post_format_re = re.compile(r'[\`\*\~]$') + url_re = re.compile(r'\[([^]]+)\]\(%%URL\)') + link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)') + + def __init__(self, file_path, meta_keys={'subreddit': 'section'}): + """ + file_path (unicode / Path): Path to archive or directory of archives. + meta_keys (dict): Meta data key included in the Reddit corpus, mapped + to display name in Prodigy meta. + RETURNS (Reddit): The Reddit loader. 
+        """
+        self.meta = meta_keys
+        file_path = Path(file_path)
+        if not file_path.exists():
+            raise IOError("Can't find file path: {}".format(file_path))
+        if not file_path.is_dir():
+            self.files = [file_path]
+        else:
+            self.files = list(file_path.iterdir())
+
+    def __iter__(self):
+        for file_path in self.iter_files():
+            with bz2.open(str(file_path)) as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    comment = ujson.loads(line)
+                    if self.is_valid(comment):
+                        text = self.strip_tags(comment['body'])
+                        yield {'text': text}
+
+    def get_meta(self, item):
+        return {name: item.get(key, 'n/a') for key, name in self.meta.items()}
+
+    def iter_files(self):
+        for file_path in self.files:
+            yield file_path
+
+    def strip_tags(self, text):
+        text = self.link_re.sub(r'\1', text)
+        text = text.replace('&gt;', '>').replace('&lt;', '<')
+        text = self.pre_format_re.sub('', text)
+        text = self.post_format_re.sub('', text)
+        text = re.sub(r'\s+', ' ', text)
+        return text.strip()
+
+    def is_valid(self, comment):
+        return comment['body'] is not None \
+            and comment['body'] != '[deleted]' \
+            and comment['body'] != '[removed]'
+
+
+def main(path):
+    reddit = Reddit(path)
+    for comment in reddit:
+        print(ujson.dumps(comment))
+
+
+if __name__ == '__main__':
+    try:
+        plac.call(main)
+    except BrokenPipeError:
+        import os, sys
+        # Python flushes standard streams on exit; redirect remaining output
+        # to devnull to avoid another BrokenPipeError at shutdown
+        devnull = os.open(os.devnull, os.O_WRONLY)
+        os.dup2(devnull, sys.stdout.fileno())
+        sys.exit(1)  # Python exits with error code 1 on EPIPE

From 2ddd4288349db030e33335dfadc218b950b0e45e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 15 Nov 2018 23:34:35 +0000
Subject: [PATCH 6/8] Fix pretrain script

---
 spacy/cli/pretrain.py | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index cb5a524f3..f46d41452 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -25,7 +25,7 @@ from collections import Counter

 import spacy
 from spacy.attrs import ID
-from spacy.util import minibatch_by_words, use_gpu, compounding, ensure_path
+from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
 from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
 from thinc.v2v import Affine

@@ -85,7 +85,8 @@ def get_vectors_loss(ops, docs, prediction):
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
     d_scores = (prediction - target) / prediction.shape[0]
-    loss = (d_scores**2).sum()
+    # Don't want to return a cupy object here
+    loss = float((d_scores**2).sum())
     return loss, d_scores


@@ -97,11 +98,16 @@ def create_pretraining_model(nlp, tok2vec):
     '''
     output_size = nlp.vocab.vectors.data.shape[1]
     output_layer = zero_init(Affine(output_size, drop_factor=0.0))
+    # This is annoying, but the parser etc have the flatten step after
+    # the tok2vec. To load the weights in cleanly, we need to match
+    # the shape of the models' components exactly. So what we call
+    # "tok2vec" has to be the same set of processes as what the components do.
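For orientation, a minimal self-contained sketch of the shape bookkeeping the comment above describes; numpy stands in for the real ops here, and the token counts and width are invented for the illustration:

    import numpy

    # tok2vec returns one array per Doc, e.g. 5 and 8 tokens at width 128
    doc_vectors = [numpy.zeros((5, 128)), numpy.zeros((8, 128))]
    # flatten concatenates them into a single array with one row per token,
    # which is what the Affine output layer (and the components' own models) see
    flat = numpy.vstack(doc_vectors)
    assert flat.shape == (13, 128)
    # Because the pretraining model now saves chain(Tok2Vec(...), flatten) as one
    # unit, the serialized bytes line up with each component's tok2vec sub-model,
    # so 'spacy train' can restore them with from_bytes().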
+ tok2vec = chain(tok2vec, flatten) model = chain( tok2vec, - flatten, output_layer ) + model.tok2vec = tok2vec model.output_layer = output_layer model.begin_training([nlp.make_doc('Give it a doc to infer shapes')]) return model @@ -144,7 +150,7 @@ class ProgressTracker(object): nr_iter=("Number of iterations to pretrain", "option", "i", int), ) def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4, - embed_rows=1000, dropout=0.2, nr_iter=1, seed=0): + embed_rows=1000, dropout=0.2, nr_iter=10, seed=0): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate language-modelling objective. Specifically, we load @@ -170,29 +176,29 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4, file_.write(json.dumps(config)) has_gpu = prefer_gpu() nlp = spacy.load(vectors_model) - tok2vec = Tok2Vec(width, embed_rows, - conv_depth=depth, - pretrained_vectors=nlp.vocab.vectors.name, - bilstm_depth=0, # Requires PyTorch. Experimental. - cnn_maxout_pieces=2, # You can try setting this higher - subword_features=True) # Set to False for character models, e.g. Chinese - model = create_pretraining_model(nlp, tok2vec) + model = create_pretraining_model(nlp, + Tok2Vec(width, embed_rows, + conv_depth=depth, + pretrained_vectors=nlp.vocab.vectors.name, + bilstm_depth=0, # Requires PyTorch. Experimental. + cnn_maxout_pieces=2, # You can try setting this higher + subword_features=True)) # Set to False for character models, e.g. Chinese optimizer = create_default_optimizer(model.ops) tracker = ProgressTracker() print('Epoch', '#Words', 'Loss', 'w/s') texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc) for epoch in range(nr_iter): - for batch in minibatch_by_words(texts, tuples=False, size=50000): + for batch in minibatch(texts, size=64): docs = [nlp.make_doc(text) for text in batch] loss = make_update(model, docs, optimizer, drop=dropout) progress = tracker.update(epoch, loss, docs) if progress: print(*progress) - if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7: + if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**6: break with model.use_params(optimizer.averages): with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_: - file_.write(tok2vec.to_bytes()) + file_.write(model.tok2vec.to_bytes()) with (output_dir / 'log.jsonl').open('a') as file_: file_.write(json.dumps({'nr_word': tracker.nr_word, 'loss': tracker.loss, 'epoch': epoch})) From 2874b8efd8a21c5be5601c984c8d231670ac03c4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Nov 2018 23:34:54 +0000 Subject: [PATCH 7/8] Fix tok2vec loading in spacy train --- spacy/cli/train.py | 2 +- spacy/syntax/nn_parser.pyx | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index ccd404db3..01aebfae8 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -213,7 +213,7 @@ def _load_pretrained_tok2vec(nlp, loc): loaded = [] for name, component in nlp.pipeline: if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'): - component.model.tok2vec.from_bytes(weights_data) + component.tok2vec.from_bytes(weights_data) loaded.append(name) return loaded diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 25cab43f8..f421520ce 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -126,6 +126,10 @@ cdef class Parser: def __reduce__(self): return (Parser, (self.vocab, self.moves, self.model), None, None) + + @property + def 
tok2vec(self): + return self.model.tok2vec @property def move_names(self): From c89fd19f660875e5c9cc7a7ec24c9f7e3977163e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 16 Nov 2018 02:22:05 +0100 Subject: [PATCH 8/8] Hack broken pipe error for Python2 --- bin/load_reddit.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bin/load_reddit.py b/bin/load_reddit.py index 1b534bcc7..73ae0b6b5 100644 --- a/bin/load_reddit.py +++ b/bin/load_reddit.py @@ -76,6 +76,11 @@ def main(path): if __name__ == '__main__': + import socket + try: + BrokenPipeError + except NameError: + BrokenPipeError = socket.error try: plac.call(main) except BrokenPipeError:
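Taken together, the series is meant to be driven from the command line. A rough end-to-end sketch of that workflow follows; the data path, the vectors package name (en_vectors_web_lg), the training files (train.json, dev.json) and the chosen epoch file (model0.bin) are placeholders to adapt to your own setup:

    # Stream cleaned Reddit comments as JSONL and pretrain from stdin ('-')
    python bin/load_reddit.py /data/RC_2018-01.bz2 | python -m spacy pretrain - en_vectors_web_lg /tmp/pretrain_out

    # Initialise the tok2vec weights during training; -t2v is the init_tok2vec option
    python -m spacy train en /tmp/model_out train.json dev.json -t2v /tmp/pretrain_out/model0.bin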