diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 18589a954..7fdd39932 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -17,9 +17,10 @@ from wasabi import msg
 
 from ..vectors import Vectors
 from ..errors import Errors, Warnings
-from ..util import ensure_path, get_lang_class, OOV_RANK
+from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
 from ..lookups import Lookups
 
+
 try:
     import ftfy
 except ImportError:
@@ -51,6 +52,7 @@ DEFAULT_OOV_PROB = -20
     ),
     model_name=("Optional name for the model meta", "option", "mn", str),
     omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
+    base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
 )
 def init_model(
     lang,
@@ -64,6 +66,7 @@ def init_model(
     vectors_name=None,
     model_name=None,
     omit_extra_lookups=False,
+    base_model=None,
 ):
     """
     Create a new model from raw data, like word frequencies, Brown clusters
@@ -95,7 +98,7 @@ def init_model(
         lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
 
     with msg.loading("Creating model..."):
-        nlp = create_model(lang, lex_attrs, name=model_name)
+        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
 
     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
@@ -164,9 +167,16 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
     return lex_attrs
 
 
-def create_model(lang, lex_attrs, name=None):
-    lang_class = get_lang_class(lang)
-    nlp = lang_class()
+def create_model(lang, lex_attrs, name=None, base_model=None):
+    if base_model:
+        nlp = load_model(base_model)
+        # Keep the base model's tokenizer, but remove all of its pipeline
+        # components, since they may rely on vectors that conflict with ours.
+        for pipe in nlp.pipe_names:
+            nlp.remove_pipe(pipe)
+    else:
+        lang_class = get_lang_class(lang)
+        nlp = lang_class()
     for lexeme in nlp.vocab:
         lexeme.rank = OOV_RANK
     for attrs in lex_attrs:
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index d5c6bf2a8..fafa492c6 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -9,7 +9,6 @@ import numpy
 cimport cython.parallel
 import numpy.random
 cimport numpy as np
-from itertools import islice
 from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from libc.math cimport exp
@@ -621,15 +620,15 @@ cdef class Parser:
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
             if sgd is None:
                 sgd = self.create_optimizer()
-            doc_sample = []
-            gold_sample = []
-            for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
+            docs = []
+            golds = []
+            for raw_text, annots_brackets in get_gold_tuples():
                 for annots, brackets in annots_brackets:
                     ids, words, tags, heads, deps, ents = annots
-                    doc_sample.append(Doc(self.vocab, words=words))
-                    gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags,
-                                                 heads=heads, deps=deps, entities=ents))
-            self.model.begin_training(doc_sample, gold_sample)
+                    docs.append(Doc(self.vocab, words=words))
+                    golds.append(GoldParse(docs[-1], words=words, tags=tags,
+                                           heads=heads, deps=deps, entities=ents))
+            self.model.begin_training(docs, golds)
             if pipeline is not None:
                 self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
             link_vectors_to_models(self.vocab)