spaCy/examples/training/train_textcat.py

#!/usr/bin/env python
# coding: utf8
"""Train a multi-label convolutional neural network text classifier on the
IMDB dataset, using the TextCategorizer component. The dataset will be loaded
automatically via Thinc's built-in dataset loader. The model is added to
spacy.pipeline, and predictions are available via `doc.cats`. For more details,
see the documentation:
* Training: https://spacy.io/usage/training
* Text classification: https://spacy.io/usage/text-classification

Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import thinc.extra.datasets

import spacy
from spacy.util import minibatch, compounding


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_texts=("Number of texts to train from", "option", "t", int),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    """Train a text classifier with a single 'POSITIVE' label on IMDB data.

    model: name of an existing spaCy model to load; when omitted, a blank
        'en' pipeline is created instead.
    output_dir: optional directory the trained pipeline is saved to (created,
        including missing parents, if it does not exist). After saving, the
        pipeline is loaded back and run on a sample text as a sanity check.
    n_iter: number of passes over the training data.
    n_texts: number of IMDB texts requested from load_data(); they are split
        there into a training and an evaluation portion.

    Loss, precision, recall and F-score are printed after every iteration.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    textcat.add_label('POSITIVE')

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    # Report the number of examples actually loaded: n_texts itself can be 0
    # (load_data then keeps everything) or larger than the dataset.
    n_examples = len(train_texts) + len(dev_texts)
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_examples, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for _ in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    # test the trained model, using the same averaged parameters that the
    # scores above were computed with
    test_text = "This movie sucked"
    with textcat.model.use_params(optimizer.averages):
        doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so that a nested output path works as well
            output_dir.mkdir(parents=True)
        # save the averaged parameters, so the model on disk is the one that
        # was evaluated during training
        with textcat.model.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)


def load_data(limit=0, split=0.8):
    """Load IMDB examples and divide them into a train and an evaluation part.

    limit: how many of the shuffled examples to keep (taken from the end of
        the shuffled list). With the default of 0 the slice keeps everything.
    split: fraction of the kept examples that goes into the training part;
        the remainder is returned as the evaluation part.

    Returns ((train_texts, train_cats), (dev_texts, dev_cats)), where each
    cats entry is a dict of the form {'POSITIVE': bool}.
    """
    # Only the first element returned by the loader is used; the evaluation
    # portion is carved out of it below.
    examples, _ = thinc.extra.datasets.imdb()
    random.shuffle(examples)
    examples = examples[-limit:]
    texts, labels = zip(*examples)
    cats = [{'POSITIVE': bool(label)} for label in labels]
    n_train = int(len(examples) * split)
    train_part = (texts[:n_train], cats[:n_train])
    dev_part = (texts[n_train:], cats[n_train:])
    return train_part, dev_part


def evaluate(tokenizer, textcat, texts, cats):
    """Score the text classifier's predictions against gold categories.

    tokenizer: callable that turns a text into a doc.
    textcat: component whose pipe() method takes the docs and yields them
        back with predicted scores available in doc.cats.
    texts: the texts to classify.
    cats: one dict per text, mapping labels to gold values. Labels missing
        from a text's dict are skipped for that text.

    A score or gold value of at least 0.5 counts as positive. Returns a dict
    with the keys 'textcat_p', 'textcat_r' and 'textcat_f' (precision, recall
    and F-score as floats). A metric whose denominator would be zero is
    reported as 0.0.
    """
    docs = (tokenizer(text) for text in texts)
    # Float counters keep the divisions below true divisions on Python 2 too.
    tp = 0.0  # True positives
    fp = 0.0  # False positives
    fn = 0.0  # False negatives
    for doc, gold in zip(textcat.pipe(docs), cats):
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            predicted = score >= 0.5
            expected = gold[label] >= 0.5
            if predicted and expected:
                tp += 1.0
            elif predicted:
                fp += 1.0
            elif expected:
                fn += 1.0
            # True negatives don't enter precision, recall or F-score.
    # Guard the divisions explicitly rather than seeding the counters with
    # an epsilon: an epsilon in tp makes precision 0.5 when nothing at all
    # was predicted positive.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall:
        f_score = 2 * (precision * recall) / (precision + recall)
    else:
        f_score = 0.0
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}


# Script entry point: only runs when the file is executed directly, and hands
# main() to plac, which is configured by the @plac.annotations on main().
if __name__ == '__main__':
    plac.call(main)
Update textcat example 2017-10-26 22:32:19 +00:00			`#!/usr/bin/env python`
			`# coding: utf8`
			`"""Train a multi-label convolutional neural network text classifier on the`
			`IMDB dataset, using the TextCategorizer component. The dataset will be loaded`
Update textcat training example and docs 2017-10-26 22:48:45 +00:00			`automatically via Thinc's built-in dataset loader. The model is added to`
Fix formatting 2017-10-31 23:43:22 +00:00			spacy.pipeline, and predictions are available via `doc.cats`. For more details,
			`see the documentation:`
Get docs ready for v2.0.0 2017-11-07 11:00:43 +00:00			`* Training: https://spacy.io/usage/training`
			`* Text classification: https://spacy.io/usage/text-classification`
Update textcat example 2017-10-26 22:32:19 +00:00
Update examples 2017-11-07 00:22:30 +00:00			`Compatible with: spaCy v2.0.0+`
Update textcat example 2017-10-26 22:32:19 +00:00			`"""`
			`from __future__ import unicode_literals, print_function`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`import plac`
			`import random`
Update textcat example 2017-10-26 22:32:19 +00:00			`from pathlib import Path`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`import thinc.extra.datasets`

Update textcat example 2017-10-26 22:32:19 +00:00			`import spacy`
Update and document new util functions 2017-11-06 23:22:43 +00:00			`from spacy.util import minibatch, compounding`
Add example for training text classifier 2017-07-22 18:15:32 +00:00
Update textcat example 2017-10-04 13:12:28 +00:00
Update textcat example 2017-10-26 22:32:19 +00:00			`@plac.annotations(`
			`model=("Model name. Defaults to blank 'en' model.", "option", "m", str),`
			`output_dir=("Optional output directory", "option", "o", Path),`
Update textcat example 2017-11-01 16:09:22 +00:00			`n_texts=("Number of texts to train from", "option", "t", int),`
Update textcat example 2017-10-26 22:32:19 +00:00			`n_iter=("Number of training iterations", "option", "n", int))`
Fix print statements in text classifier example 2017-11-01 15:34:31 +00:00			`def main(model=None, output_dir=None, n_iter=20, n_texts=2000):`
Update textcat example 2017-10-26 22:32:19 +00:00			`if model is not None:`
			`nlp = spacy.load(model) # load existing spaCy model`
			`print("Loaded model '%s'" % model)`
			`else:`
			`nlp = spacy.blank('en') # create blank Language class`
			`print("Created blank 'en' model")`

			`# add the text classifier to the pipeline if it doesn't exist`
			`# nlp.create_pipe works for built-ins that are registered with spaCy`
			`if 'textcat' not in nlp.pipe_names:`
Update textcat example 2017-11-01 16:09:22 +00:00			`textcat = nlp.create_pipe('textcat')`
Update textcat training example and docs 2017-10-26 22:48:45 +00:00			`nlp.add_pipe(textcat, last=True)`
Update textcat example 2017-10-26 22:32:19 +00:00			`# otherwise, get it, so we can add labels to it`
			`else:`
			`textcat = nlp.get_pipe('textcat')`

			`# add label to text classifier`
Update textcat example 2017-11-01 16:09:22 +00:00			`textcat.add_label('POSITIVE')`
Update textcat example 2017-10-26 22:32:19 +00:00
			`# load the IMBD dataset`
			`print("Loading IMDB data...")`
Fix print statements in text classifier example 2017-11-01 15:34:31 +00:00			`(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)`
Fix print statement in textcat training example (resolves #1515) 2017-11-08 16:17:40 +00:00			`print("Using {} examples ({} training, {} evaluation)"`
			`.format(n_texts, len(train_texts), len(dev_texts)))`
Update training examples to use "simple style" 2017-11-06 22:14:04 +00:00			`train_data = list(zip(train_texts,`
			`[{'cats': cats} for cats in train_cats]))`
Update textcat example 2017-10-26 22:32:19 +00:00
			`# get names of other pipes to disable them during training`
			`other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']`
			`with nlp.disable_pipes(*other_pipes): # only train textcat`
Fix begin_training if get_gold_tuples is None 2017-11-01 12:14:31 +00:00			`optimizer = nlp.begin_training()`
Update textcat example 2017-10-26 22:32:19 +00:00			`print("Training the model...")`
			`print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))`
			`for i in range(n_iter):`
			`losses = {}`
			`# batch up the examples using spaCy's minibatch`
Fix print statements in text classifier example 2017-11-01 15:34:31 +00:00			`batches = minibatch(train_data, size=compounding(4., 32., 1.001))`
Update textcat example 2017-10-26 22:32:19 +00:00			`for batch in batches:`
Update training examples to use "simple style" 2017-11-06 22:14:04 +00:00			`texts, annotations = zip(*batch)`
			`nlp.update(texts, annotations, sgd=optimizer, drop=0.2,`
			`losses=losses)`
Update textcat example 2017-10-26 22:32:19 +00:00			`with textcat.model.use_params(optimizer.averages):`
			`# evaluate on the dev data split off in load_data()`
			`scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)`
Fix print statements in text classifier example 2017-11-01 15:34:31 +00:00			`print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table`
Update textcat example 2017-10-26 22:32:19 +00:00			`.format(losses['textcat'], scores['textcat_p'],`
			`scores['textcat_r'], scores['textcat_f']))`

			`# test the trained model`
			`test_text = "This movie sucked"`
			`doc = nlp(test_text)`
			`print(test_text, doc.cats)`

			`if output_dir is not None:`
			`output_dir = Path(output_dir)`
			`if not output_dir.exists():`
			`output_dir.mkdir()`
			`nlp.to_disk(output_dir)`
			`print("Saved model to", output_dir)`

			`# test the saved model`
			`print("Loading from", output_dir)`
			`nlp2 = spacy.load(output_dir)`
			`doc2 = nlp2(test_text)`
			`print(test_text, doc2.cats)`


			`def load_data(limit=0, split=0.8):`
			`"""Load data from the IMDB dataset."""`
			`# Partition off part of the train data for evaluation`
			`train_data, _ = thinc.extra.datasets.imdb()`
			`random.shuffle(train_data)`
			`train_data = train_data[-limit:]`
			`texts, labels = zip(*train_data)`
			`cats = [{'POSITIVE': bool(y)} for y in labels]`
			`split = int(len(train_data) * split)`
			`return (texts[:split], cats[:split]), (texts[split:], cats[split:])`
Add example for training text classifier 2017-07-22 18:15:32 +00:00

			`def evaluate(tokenizer, textcat, texts, cats):`
			`docs = (tokenizer(text) for text in texts)`
Update textcat example 2017-10-26 22:32:19 +00:00			`tp = 1e-8 # True positives`
			`fp = 1e-8 # False positives`
			`fn = 1e-8 # False negatives`
			`tn = 1e-8 # True negatives`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`for i, doc in enumerate(textcat.pipe(docs)):`
			`gold = cats[i]`
			`for label, score in doc.cats.items():`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-05 23:43:02 +00:00			`if label not in gold:`
			`continue`
			`if score >= 0.5 and gold[label] >= 0.5:`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`tp += 1.`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-05 23:43:02 +00:00			`elif score >= 0.5 and gold[label] < 0.5:`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`fp += 1.`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-05 23:43:02 +00:00			`elif score < 0.5 and gold[label] < 0.5:`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`tn += 1`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-05 23:43:02 +00:00			`elif score < 0.5 and gold[label] >= 0.5:`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`fn += 1`
Update textcat example 2017-10-26 22:32:19 +00:00			`precision = tp / (tp + fp)`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`recall = tp / (tp + fn)`
Update textcat example 2017-10-26 22:32:19 +00:00			`f_score = 2 * (precision * recall) / (precision + recall)`
			`return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}`
Finish text classifier example 2017-07-22 22:34:12 +00:00
Add example for training text classifier 2017-07-22 18:15:32 +00:00
			`if __name__ == '__main__':`
			`plac.call(main)`