diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index fb96e6c05..34117db22 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -32,6 +32,7 @@ numpy.random.seed(0)
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
     vectors=("Model to load vectors from", "option", "v"),
+    vectors_limit=("Truncate to N vectors (requires -v)", "option", None, int),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
@@ -40,9 +41,9 @@ numpy.random.seed(0)
     meta_path=("Optional path to meta.json. All relevant properties will be "
               "overwritten.", "option", "m", Path))
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
-          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
-          no_entities=False, gold_preproc=False, version="0.0.0",
-          meta_path=None):
+          use_gpu=-1, vectors=None, vectors_limit=None, no_tagger=False,
+          no_parser=False, no_entities=False, gold_preproc=False,
+          version="0.0.0", meta_path=None):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
@@ -94,6 +95,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
         nlp.meta.update(meta)
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
+        if vectors_limit is not None:
+            nlp.vocab.prune_vectors(vectors_limit)
     for name in pipeline:
         nlp.add_pipe(nlp.create_pipe(name), name=name)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 160f0d5bd..ff6c5b844 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -5,6 +5,7 @@ import numpy
 import dill
 
 from collections import OrderedDict
+from thinc.neural.util import get_array_module
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
@@ -247,6 +248,44 @@ cdef class Vocab:
             width = self.vectors.data.shape[1]
         self.vectors = Vectors(self.strings, width=width)
 
+    def prune_vectors(self, nr_row, batch_size=1024):
+        """Reduce the current vector table to `nr_row` unique entries. Words
+        mapped to the discarded vectors will be remapped to the closest vector
+        among those remaining.
+
+        For example, suppose the original table had vectors for the words
+        ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to
+        two rows, we would discard the vectors for 'feline' and 'reclined'.
+        These words would then be remapped to the closest remaining vector
+        -- so "feline" would have the same vector as "cat", and "reclined"
+        would have the same vector as "sat".
+
+        The similarities are judged by cosine. The original vectors may
+        be large, so the cosines are calculated in minibatches, to reduce
+        memory usage.
+        """
+        xp = get_array_module(self.vectors.data)
+        # Work in batches, to avoid memory problems.
+        keep = self.vectors.data[:nr_row]
+        toss = self.vectors.data[nr_row:]
+        # Normalize each row, so cosine similarity is just the dot product.
+        # Note we can't modify the ones we're keeping in-place...
+        keep = keep / (xp.linalg.norm(keep, axis=1, keepdims=True) + 1e-8)
+        keep = xp.ascontiguousarray(keep.T)
+        neighbours = xp.zeros((toss.shape[0],), dtype='i')
+        for i in range(0, toss.shape[0], batch_size):
+            batch = toss[i : i+batch_size]
+            batch /= xp.linalg.norm(batch, axis=1, keepdims=True) + 1e-8
+            neighbours[i:i+batch_size] = xp.dot(batch, keep).argmax(axis=1)
+        for lex in self:
+            # If we're losing the vector for this word, map it to the nearest
+            # vector we're keeping.
+            if lex.rank >= nr_row:
+                lex.rank = neighbours[lex.rank-nr_row]
+                self.vectors.add(lex.orth, row=lex.rank)
+        # Make copy, to encourage the original table to be garbage collected.
+        self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
+
     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary. Words can be
         looked up by string or int ID. If no vectors data is loaded, ValueError is
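
Note on the pruning step: the remapping in prune_vectors is just a batched cosine-similarity argmax over the rows that survive the cut. The following standalone sketch is not part of the patch -- it is plain NumPy with made-up names, assuming the table is already ordered so that the first nr_row rows are the ones worth keeping -- but it shows the same computation on a toy table:

import numpy

def nearest_kept_rows(vectors, nr_row, batch_size=1024):
    # Illustrative sketch only: for each row past nr_row, find the most
    # similar row among the first nr_row rows, judged by cosine similarity.
    keep = vectors[:nr_row]
    toss = vectors[nr_row:]
    # Normalise each row, so cosine similarity reduces to a dot product.
    keep = keep / (numpy.linalg.norm(keep, axis=1, keepdims=True) + 1e-8)
    keep = numpy.ascontiguousarray(keep.T)
    neighbours = numpy.zeros((toss.shape[0],), dtype='i')
    for i in range(0, toss.shape[0], batch_size):
        batch = toss[i : i + batch_size]
        batch = batch / (numpy.linalg.norm(batch, axis=1, keepdims=True) + 1e-8)
        # (batch, width) @ (width, nr_row) -> similarities; take the best column.
        neighbours[i : i + batch_size] = numpy.dot(batch, keep).argmax(axis=1)
    return neighbours

# Toy table: rows 0-1 are kept ('sat', 'cat'), rows 2-3 are pruned.
vectors = numpy.array([[1.0, 0.0],    # 'sat'
                       [0.0, 1.0],    # 'cat'
                       [0.1, 0.9],    # 'feline'   -> remapped to row 1 ('cat')
                       [0.9, 0.2]])   # 'reclined' -> remapped to row 0 ('sat')
print(nearest_kept_rows(vectors, nr_row=2, batch_size=2))   # [1 0]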
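
At the Python level the new method can be called directly on a loaded pipeline's vocab; the new vectors_limit option in spacy train applies the same truncation right after the vectors are loaded. The model name and table sizes below are only an example, not taken from the patch:

import spacy

nlp = spacy.load('en_core_web_lg')     # any model shipping word vectors (example name)
print(nlp.vocab.vectors.data.shape)    # e.g. (684830, 300) before pruning
nlp.vocab.prune_vectors(20000)         # keep 20000 rows, remap the rest
print(nlp.vocab.vectors.data.shape)    # (20000, 300) after pruning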