# spaCy/examples/training/pretrain_textcat.py

"""This script is experimental.

Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pretrained vectors
(from something like word2vec, GloVe, FastText etc), and use the CNN to
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
we're not merely doing compression here, because heavy dropout is applied,
including over the input words. This means the model must often (50% of the time)
use the context in order to predict the word.

To evaluate the technique, we're pre-training with the 50k texts from the IMDB
corpus, and then training with only 100 labels. Note that it's a bit dirty to
pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text.
"""
import plac
import tqdm
import random
import spacy
import thinc.extra.datasets
from spacy.util import minibatch, use_gpu, compounding
from spacy._ml import Tok2Vec
from spacy.pipeline import TextCategorizer
import numpy


def load_texts(limit=0):
    """Load raw IMDB review texts (labels discarded) for pre-training.

    limit (int): If >= 1, return only the first `limit` texts of the
        shuffled training split. Otherwise return all shuffled training
        texts followed by all shuffled development texts.
    RETURNS (list): The selected texts.
    """
    train, dev = thinc.extra.datasets.imdb()
    train_texts, _ = zip(*train)
    # Unpack the development split here. Unpacking `train` a second time
    # would return the training texts twice and never touch the dev texts.
    dev_texts, _ = zip(*dev)
    train_texts = list(train_texts)
    dev_texts = list(dev_texts)
    random.shuffle(train_texts)
    random.shuffle(dev_texts)
    if limit >= 1:
        return train_texts[:limit]
    return train_texts + dev_texts


def load_textcat_data(limit=0):
    """Build labelled text-categorizer examples from the IMDB dataset.

    The training split is shuffled in place and cut down to its last `limit`
    examples; the evaluation split is used in full and in its given order.

    limit (int): Number of training examples to keep. With the default of 0
        the slice `[-0:]` is the whole list, so nothing is dropped.
    RETURNS (tuple): `(texts, cats), (eval_texts, eval_cats)`, where each
        `cats` entry is a dict with boolean "POSITIVE" and "NEGATIVE" keys.
    """

    def to_cats(label):
        # The two categories are mutually exclusive views of one label.
        positive = bool(label)
        return {"POSITIVE": positive, "NEGATIVE": not positive}

    train_split, eval_split = thinc.extra.datasets.imdb()
    random.shuffle(train_split)
    texts, labels = zip(*train_split[-limit:])
    eval_texts, eval_labels = zip(*eval_split)
    cats = [to_cats(label) for label in labels]
    eval_cats = [to_cats(label) for label in eval_labels]
    return (texts, cats), (eval_texts, eval_cats)


def prefer_gpu():
    """Try to activate GPU device 0 and report whether that worked.

    RETURNS (bool): False when `spacy.util.use_gpu(0)` returns None,
        otherwise True (after seeding cupy's random number generator with 0).
    """
    if spacy.util.use_gpu(0) is None:
        return False

    # Import cupy only on the GPU path so the script still runs without it.
    import cupy.random

    cupy.random.seed(0)
    return True


def build_textcat_model(tok2vec, nr_class, width):
    """Chain a token encoder into a mean-pooled softmax classifier.

    tok2vec: The token-to-vector model used as the first layer.
    nr_class (int): Number of output classes, passed to `Softmax`.
    width (int): Second argument to `Softmax` -- presumably the tok2vec
        output width; confirm against the thinc `Softmax` signature.
    RETURNS: The chained model, with the encoder exposed as `model.tok2vec`.
    """
    # Only the names actually used below are imported.
    from thinc.v2v import Model, Softmax
    from thinc.api import flatten_add_lengths, chain
    from thinc.t2v import Pooling, mean_pool

    with Model.define_operators({">>": chain}):
        model = (
            tok2vec
            >> flatten_add_lengths
            >> Pooling(mean_pool)
            >> Softmax(nr_class, width)
        )
    # Keep a handle on the encoder: train_textcat() saves and restores its
    # weights through this attribute.
    model.tok2vec = tok2vec
    return model


def block_gradients(model):
    """Wrap `model` so its forward pass runs but no backprop callback is returned.

    model: The model whose `begin_update` supplies the forward output.
    RETURNS: The result of `thinc.api.wrap` applied to the gradient-blocking
        forward function and `model`.
    """
    from thinc.api import wrap

    def forward(X, drop=0.0):
        # Discard the wrapped model's backprop callback and return None in
        # its place.
        output, _backprop = model.begin_update(X, drop=drop)
        return output, None

    return wrap(forward, model)


def create_pipeline(width, embed_size, vectors_model):
    """Load the vectors model and append a custom text categorizer to it.

    width (int): Width given to `Tok2Vec` and to the classifier builder.
    embed_size (int): Embedding size given to `Tok2Vec`.
    vectors_model: Name or path handed to `spacy.load`.
    RETURNS: The loaded pipeline with the text categorizer added.
    """
    print("Load vectors")
    nlp = spacy.load(vectors_model)
    print("Start training")
    # Build the encoder and classifier first, then hand them to the component.
    encoder = Tok2Vec(width=width, embed_size=embed_size)
    classifier = build_textcat_model(encoder, 2, width)
    textcat = TextCategorizer(
        nlp.vocab, labels=["POSITIVE", "NEGATIVE"], model=classifier
    )
    nlp.add_pipe(textcat)
    return nlp


def train_tensorizer(nlp, texts, dropout, n_iter):
    """Add a tensorizer to the pipeline and train it on unlabelled texts.

    nlp: The pipeline; a "tensorizer" pipe is created and added to it.
    texts: Raw texts to train on; batched with `minibatch`.
    dropout (float): Dropout rate passed to each `tensorizer.update` call.
    n_iter (int): Number of passes over `texts`.
    RETURNS: The optimizer returned by `nlp.begin_training()`.
    """
    tensorizer = nlp.create_pipe("tensorizer")
    nlp.add_pipe(tensorizer)
    optimizer = nlp.begin_training()
    # Neither loop needs an index; the original reused `i` for both, so the
    # inner loop overwrote the epoch counter.
    for _ in range(n_iter):
        losses = {}
        for batch in minibatch(tqdm.tqdm(texts)):
            docs = [nlp.make_doc(text) for text in batch]
            # No gold annotations are passed (None): the texts are unlabelled.
            tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
        print(losses)
    return optimizer


def train_textcat(nlp, n_texts, n_iter=10):
    """Train the "textcat" pipe on labelled IMDB examples and print scores.

    nlp: Pipeline containing a "textcat" pipe whose model exposes `tok2vec`.
    n_texts (int): Passed to `load_textcat_data` as the training-set limit.
    n_iter (int): Number of training epochs; one score row is printed per epoch.
    """
    textcat = nlp.get_pipe("textcat")
    # Snapshot the encoder weights so they can be put back after
    # begin_training() (presumably it re-initialises them -- confirm).
    saved_tok2vec = textcat.model.tok2vec.to_bytes()
    train_split, dev_split = load_textcat_data(limit=n_texts)
    train_texts, train_cats = train_split
    dev_texts, dev_cats = dev_split
    summary = "Using {} examples ({} training, {} evaluation)"
    print(summary.format(n_texts, len(train_texts), len(dev_texts)))
    train_data = [
        (text, {"cats": cats}) for text, cats in zip(train_texts, train_cats)
    ]

    # Every pipe except the text categorizer is disabled while training.
    frozen_pipes = [name for name in nlp.pipe_names if name != "textcat"]
    with nlp.disable_pipes(*frozen_pipes):
        optimizer = nlp.begin_training()
        textcat.model.tok2vec.from_bytes(saved_tok2vec)
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        row_template = "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}"
        for _ in range(n_iter):
            losses = {"textcat": 0.0}
            for batch in minibatch(tqdm.tqdm(train_data), size=2):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            # Score the evaluation split using the optimizer's averaged
            # parameters rather than the latest raw weights.
            with textcat.model.use_params(optimizer.averages):
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                row_template.format(
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )


def evaluate_textcat(tokenizer, textcat, texts, cats):
    """Score text-categorizer predictions against gold categories.

    Each (document, label) pair is counted as a true/false positive/negative
    using a 0.5 threshold on both the predicted score and the gold value.
    Labels that are missing from the gold dict are skipped.

    tokenizer: Callable that turns a text into a doc.
    textcat: Object whose `pipe(docs)` yields docs carrying a `cats` dict.
    texts: The texts to evaluate.
    cats: Gold category dicts, indexed in step with `texts`.
    RETURNS (dict): Precision, recall and F-score under the keys
        "textcat_p", "textcat_r" and "textcat_f".
    """
    # Counters start just above zero so the ratios below never divide by
    # zero, even when nothing was scored.
    counts = {"tp": 1e-8, "fp": 1e-8, "tn": 1e-8, "fn": 1e-8}
    tokenized = (tokenizer(text) for text in texts)
    for index, doc in enumerate(textcat.pipe(tokenized)):
        gold = cats[index]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            truth = gold[label]
            if score >= 0.5:
                if truth >= 0.5:
                    counts["tp"] += 1.0
                elif truth < 0.5:
                    counts["fp"] += 1.0
            elif score < 0.5:
                if truth < 0.5:
                    counts["tn"] += 1
                elif truth >= 0.5:
                    counts["fn"] += 1
    precision = counts["tp"] / (counts["tp"] + counts["fp"])
    recall = counts["tp"] / (counts["tp"] + counts["fn"])
    f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


@plac.annotations(
    width=("Width of CNN layers", "positional", None, int),
    embed_size=("Embedding rows", "positional", None, int),
    pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
    # This help text used to repeat the pretrain description verbatim.
    train_iters=("Number of iterations to train", "option", "tn", int),
    train_examples=("Number of labelled examples", "option", "eg", int),
    vectors_model=("Name or path to vectors model to learn from"),
)
def main(
    width,
    embed_size,
    vectors_model,
    pretrain_iters=30,
    train_iters=30,
    train_examples=1000,
):
    """Pre-train a tensorizer on IMDB texts, then train the text categorizer."""
    # Fixed seeds for the stdlib and numpy generators.
    random.seed(0)
    numpy.random.seed(0)
    # Named so it does not shadow the `use_gpu` imported from spacy.util.
    gpu_enabled = prefer_gpu()
    print("Using GPU?", gpu_enabled)

    nlp = create_pipeline(width, embed_size, vectors_model)
    print("Load data")
    texts = load_texts(limit=0)
    print("Train tensorizer")
    # The returned optimizer is not needed: train_textcat() creates its own.
    train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
    print("Train textcat")
    train_textcat(nlp, train_examples, n_iter=train_iters)


# Script entry point: only runs when the file is executed directly. main() is
# handed to plac, which is given the argument annotations declared on it.
if __name__ == "__main__":
    plac.call(main)
Auto-format examples 2018-12-02 03:26:26 +00:00			`"""This script is experimental.`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00
			`Try pre-training the CNN component of the text categorizer using a cheap`
Use consistent spelling 2019-10-02 08:37:39 +00:00			`language modelling-like objective. Specifically, we load pretrained vectors`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`(from something like word2vec, GloVe, FastText etc), and use the CNN to`
Use consistent spelling 2019-10-02 08:37:39 +00:00			`predict the tokens' pretrained vectors. This isn't as easy as it sounds:`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`we're not merely doing compression here, because heavy dropout is applied,`
			`including over the input words. This means the model must often (50% of the time)`
			`use the context in order to predict the word.`

			`To evaluate the technique, we're pre-training with the 50k texts from the IMDB`
			`corpus, and then training with only 100 labels. Note that it's a bit dirty to`
			`pre-train with the development data, but also not so terrible: we're not using`
			`the development labels, after all --- only the unlabelled text.`
Auto-format examples 2018-12-02 03:26:26 +00:00			`"""`
Add tensorizer training example 2018-11-02 22:52:12 +00:00			`import plac`
Restore tqdm imports (#4804) * set 4.38.0 to minimal version with color bug fix * set imports back to proper place * add upper range for tqdm 2019-12-16 12:12:19 +00:00			`import tqdm`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`import random`
Add tensorizer training example 2018-11-02 22:52:12 +00:00			`import spacy`
			`import thinc.extra.datasets`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`from spacy.util import minibatch, use_gpu, compounding`
			`from spacy._ml import Tok2Vec`
			`from spacy.pipeline import TextCategorizer`
			`import numpy`
Add tensorizer training example 2018-11-02 22:52:12 +00:00

Work on pretraining script 2018-11-03 12:53:25 +00:00			`def load_texts(limit=0):`
Add tensorizer training example 2018-11-02 22:52:12 +00:00			`train, dev = thinc.extra.datasets.imdb()`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`train_texts, train_labels = zip(*train)`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`dev_texts, dev_labels = zip(*train)`
			`train_texts = list(train_texts)`
			`dev_texts = list(dev_texts)`
			`random.shuffle(train_texts)`
			`random.shuffle(dev_texts)`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`if limit >= 1:`
			`return train_texts[:limit]`
			`else:`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`return list(train_texts) + list(dev_texts)`
Work on pretraining script 2018-11-03 12:53:25 +00:00
Add tensorizer training example 2018-11-02 22:52:12 +00:00
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`def load_textcat_data(limit=0):`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`"""Load data from the IMDB dataset."""`
			`# Partition off part of the train data for evaluation`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`train_data, eval_data = thinc.extra.datasets.imdb()`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`random.shuffle(train_data)`
			`train_data = train_data[-limit:]`
			`texts, labels = zip(*train_data)`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`eval_texts, eval_labels = zip(*eval_data)`
Auto-format examples 2018-12-02 03:26:26 +00:00			`cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]`
			`eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`return (texts, cats), (eval_texts, eval_cats)`
Add tensorizer training example 2018-11-02 22:52:12 +00:00

Improve train tensorizer script 2018-11-03 10:54:20 +00:00			`def prefer_gpu():`
			`used = spacy.util.use_gpu(0)`
			`if used is None:`
			`return False`
			`else:`
Make pretraining script work without GPU 2018-11-04 16:09:52 +00:00			`import cupy.random`
Auto-format examples 2018-12-02 03:26:26 +00:00
Make pretraining script work without GPU 2018-11-04 16:09:52 +00:00			`cupy.random.seed(0)`
Improve train tensorizer script 2018-11-03 10:54:20 +00:00			`return True`

Work on pretraining script 2018-11-03 12:53:25 +00:00
			`def build_textcat_model(tok2vec, nr_class, width):`
Improve pretrain textcat example 2018-11-04 00:17:09 +00:00			`from thinc.v2v import Model, Softmax, Maxout`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`from thinc.api import flatten_add_lengths, chain`
Improve pretrain textcat example 2018-11-04 00:17:09 +00:00			`from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`from thinc.misc import Residual, LayerNorm`
			`from spacy._ml import logistic, zero_init`

Auto-format examples 2018-12-02 03:26:26 +00:00			`with Model.define_operators({">>": chain}):`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`model = (`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`tok2vec`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`>> flatten_add_lengths`
Improve pretrain textcat example 2018-11-04 00:17:09 +00:00			`>> Pooling(mean_pool)`
			`>> Softmax(nr_class, width)`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`)`
			`model.tok2vec = tok2vec`
			`return model`

Auto-format examples 2018-12-02 03:26:26 +00:00
Work on pretraining script 2018-11-03 12:53:25 +00:00			`def block_gradients(model):`
			`from thinc.api import wrap`
Auto-format examples 2018-12-02 03:26:26 +00:00
			`def forward(X, drop=0.0):`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`Y, _ = model.begin_update(X, drop=drop)`
			`return Y, None`
Auto-format examples 2018-12-02 03:26:26 +00:00
Work on pretraining script 2018-11-03 12:53:25 +00:00			`return wrap(forward, model)`

Auto-format examples 2018-12-02 03:26:26 +00:00
Work on pretraining script 2018-11-03 12:53:25 +00:00			`def create_pipeline(width, embed_size, vectors_model):`
Add tensorizer training example 2018-11-02 22:52:12 +00:00			`print("Load vectors")`
Improve train tensorizer script 2018-11-03 10:54:20 +00:00			`nlp = spacy.load(vectors_model)`
Add tensorizer training example 2018-11-02 22:52:12 +00:00			`print("Start training")`
Auto-format examples 2018-12-02 03:26:26 +00:00			`textcat = TextCategorizer(`
			`nlp.vocab,`
			`labels=["POSITIVE", "NEGATIVE"],`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`model=build_textcat_model(`
Auto-format examples 2018-12-02 03:26:26 +00:00			`Tok2Vec(width=width, embed_size=embed_size), 2, width`
			`),`
			`)`
Work on pretraining script 2018-11-03 12:53:25 +00:00
			`nlp.add_pipe(textcat)`
			`return nlp`

Auto-format examples 2018-12-02 03:26:26 +00:00
Work on pretraining script 2018-11-03 12:53:25 +00:00			`def train_tensorizer(nlp, texts, dropout, n_iter):`
Auto-format examples 2018-12-02 03:26:26 +00:00			`tensorizer = nlp.create_pipe("tensorizer")`
Add tensorizer training example 2018-11-02 22:52:12 +00:00			`nlp.add_pipe(tensorizer)`
			`optimizer = nlp.begin_training()`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`for i in range(n_iter):`
Add tensorizer training example 2018-11-02 22:52:12 +00:00			`losses = {}`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):`
Add tensorizer training example 2018-11-02 22:52:12 +00:00			`docs = [nlp.make_doc(text) for text in batch]`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)`
Improve train tensorizer script 2018-11-03 10:54:20 +00:00			`print(losses)`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`return optimizer`

Auto-format examples 2018-12-02 03:26:26 +00:00
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`def train_textcat(nlp, n_texts, n_iter=10):`
Auto-format examples 2018-12-02 03:26:26 +00:00			`textcat = nlp.get_pipe("textcat")`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`tok2vec_weights = textcat.model.tok2vec.to_bytes()`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)`
Auto-format examples 2018-12-02 03:26:26 +00:00			`print(`
			`"Using {} examples ({} training, {} evaluation)".format(`
			`n_texts, len(train_texts), len(dev_texts)`
			`)`
			`)`
			`train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))`
Work on pretraining script 2018-11-03 12:53:25 +00:00
			`# get names of other pipes to disable them during training`
Auto-format examples 2018-12-02 03:26:26 +00:00			`other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`with nlp.disable_pipes(*other_pipes): # only train textcat`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`optimizer = nlp.begin_training()`
			`textcat.model.tok2vec.from_bytes(tok2vec_weights)`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`print("Training the model...")`
Auto-format examples 2018-12-02 03:26:26 +00:00			`print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`for i in range(n_iter):`
Auto-format examples 2018-12-02 03:26:26 +00:00			`losses = {"textcat": 0.0}`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`# batch up the examples using spaCy's minibatch`
			`batches = minibatch(tqdm.tqdm(train_data), size=2)`
			`for batch in batches:`
			`texts, annotations = zip(*batch)`
Auto-format examples 2018-12-02 03:26:26 +00:00			`nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`with textcat.model.use_params(optimizer.averages):`
			`# evaluate on the dev data split off in load_data()`
			`scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)`
Auto-format examples 2018-12-02 03:26:26 +00:00			`print(`
			`"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table`
			`losses["textcat"],`
			`scores["textcat_p"],`
			`scores["textcat_r"],`
			`scores["textcat_f"],`
			`)`
			`)`
Work on pretraining script 2018-11-03 12:53:25 +00:00

			`def evaluate_textcat(tokenizer, textcat, texts, cats):`
			`docs = (tokenizer(text) for text in texts)`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`tp = 1e-8`
			`fp = 1e-8`
			`tn = 1e-8`
			`fn = 1e-8`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`for i, doc in enumerate(textcat.pipe(docs)):`
			`gold = cats[i]`
			`for label, score in doc.cats.items():`
			`if label not in gold:`
			`continue`
			`if score >= 0.5 and gold[label] >= 0.5:`
Auto-format examples 2018-12-02 03:26:26 +00:00			`tp += 1.0`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`elif score >= 0.5 and gold[label] < 0.5:`
Auto-format examples 2018-12-02 03:26:26 +00:00			`fp += 1.0`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`elif score < 0.5 and gold[label] < 0.5:`
			`tn += 1`
			`elif score < 0.5 and gold[label] >= 0.5:`
			`fn += 1`
			`precision = tp / (tp + fp)`
			`recall = tp / (tp + fn)`
			`f_score = 2 * (precision * recall) / (precision + recall)`
Auto-format examples 2018-12-02 03:26:26 +00:00			`return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}`
Work on pretraining script 2018-11-03 12:53:25 +00:00

			`@plac.annotations(`
			`width=("Width of CNN layers", "positional", None, int),`
			`embed_size=("Embedding rows", "positional", None, int),`
			`pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),`
			`train_iters=("Number of iterations to pretrain", "option", "tn", int),`
			`train_examples=("Number of labelled examples", "option", "eg", int),`
Auto-format examples 2018-12-02 03:26:26 +00:00			`vectors_model=("Name or path to vectors model to learn from"),`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`)`
Auto-format examples 2018-12-02 03:26:26 +00:00			`def main(`
			`width,`
			`embed_size,`
			`vectors_model,`
			`pretrain_iters=30,`
			`train_iters=30,`
			`train_examples=1000,`
			`):`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`random.seed(0)`
			`numpy.random.seed(0)`
			`use_gpu = prefer_gpu()`
			`print("Using GPU?", use_gpu)`

			`nlp = create_pipeline(width, embed_size, vectors_model)`
			`print("Load data")`
			`texts = load_texts(limit=0)`
			`print("Train tensorizer")`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)`
Work on pretraining script 2018-11-03 12:53:25 +00:00			`print("Train textcat")`
Improve pretrain textcat example 2018-11-03 17:44:12 +00:00			`train_textcat(nlp, train_examples, n_iter=train_iters)`
Add tensorizer training example 2018-11-02 22:52:12 +00:00
Auto-format examples 2018-12-02 03:26:26 +00:00
			`if __name__ == "__main__":`
Add tensorizer training example 2018-11-02 22:52:12 +00:00			`plac.call(main)`