spaCy/examples/training/train_textcat.py

'''Train a multi-label convolutional neural network text classifier,
using the spacy.pipeline.TextCategorizer component. The model is then added
to spacy.pipeline, and predictions are available at `doc.cats`.
'''
from __future__ import unicode_literals
import plac
import random
import tqdm

from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import thinc.extra.datasets

import spacy.lang.en
from spacy.gold import GoldParse, minibatch
from spacy.util import compounding
from spacy.pipeline import TextCategorizer

# TODO: Remove this once we're not supporting models trained with thinc <6.9.0
import thinc.neural._classes.layernorm
thinc.neural._classes.layernorm.set_compat_six_eight(False)


def train_textcat(tokenizer, textcat,
                  train_texts, train_cats, dev_texts, dev_cats,
                  n_iter=20):
    '''
    Train the TextCategorizer without associated pipeline.
    '''
    textcat.begin_training()
    optimizer = Adam(NumpyOps(), 0.001)
    train_docs = [tokenizer(text) for text in train_texts]
    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                  zip(train_docs, train_cats)]
    train_data = list(zip(train_docs, train_gold))
    batch_sizes = compounding(4., 128., 1.001)
    for i in range(n_iter):
        losses = {}
        # Progress bar and minibatching
        batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)
        for batch in batches:
            docs, golds = zip(*batch)
            textcat.update(docs, golds, sgd=optimizer, drop=0.2,
                losses=losses)
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
        yield losses['textcat'], scores


def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8 # True positives
    fp = 1e-8 # False positives
    fn = 1e-8 # False negatives
    tn = 1e-8 # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precis = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * (precis * recall) / (precis + recall)
    return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}


def load_data(limit=0):
    # Partition off part of the train data --- avoid running experiments
    # against test.
    train_data, _ = thinc.extra.datasets.imdb()

    random.shuffle(train_data)
    train_data = train_data[-limit:]

    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(y)} for y in labels]

    split = int(len(train_data) * 0.8)

    train_texts = texts[:split]
    train_cats = cats[:split]
    dev_texts = texts[split:]
    dev_cats = cats[split:]
    return (train_texts, train_cats), (dev_texts, dev_cats)


def main(model_loc=None):
    nlp = spacy.lang.en.English()
    tokenizer = nlp.tokenizer
    textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])

    print("Load IMDB data")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)

    print("Itn.\tLoss\tP\tR\tF")
    progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'

    for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,
                                       train_texts, train_cats,
                                       dev_texts, dev_cats, n_iter=20)):
        print(progress.format(i=i, loss=loss, **scores))
    # How to save, load and use
    nlp.pipeline.append(textcat)
    if model_loc is not None:
        nlp.to_disk(model_loc)

        nlp = spacy.load(model_loc)
        doc = nlp(u'This movie sucked!')
        print(doc.cats)


if __name__ == '__main__':
    plac.call(main)
Update textcat exampe 2017-10-04 12:55:30 +00:00			`'''Train a multi-label convolutional neural network text classifier,`
			`using the spacy.pipeline.TextCategorizer component. The model is then added`
			to spacy.pipeline, and predictions are available at `doc.cats`.
			`'''`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`from __future__ import unicode_literals`
			`import plac`
			`import random`
			`import tqdm`

			`from thinc.neural.optimizers import Adam`
			`from thinc.neural.ops import NumpyOps`
			`import thinc.extra.datasets`

			`import spacy.lang.en`
			`from spacy.gold import GoldParse, minibatch`
			`from spacy.util import compounding`
			`from spacy.pipeline import TextCategorizer`

Update textcat example 2017-10-04 13:12:28 +00:00			`# TODO: Remove this once we're not supporting models trained with thinc <6.9.0`
			`import thinc.neural._classes.layernorm`
			`thinc.neural._classes.layernorm.set_compat_six_eight(False)`


Add example for training text classifier 2017-07-22 18:15:32 +00:00			`def train_textcat(tokenizer, textcat,`
			`train_texts, train_cats, dev_texts, dev_cats,`
			`n_iter=20):`
			`'''`
			`Train the TextCategorizer without associated pipeline.`
			`'''`
			`textcat.begin_training()`
			`optimizer = Adam(NumpyOps(), 0.001)`
			`train_docs = [tokenizer(text) for text in train_texts]`
			`train_gold = [GoldParse(doc, cats=cats) for doc, cats in`
			`zip(train_docs, train_cats)]`
Update textcat example 2017-10-04 13:12:28 +00:00			`train_data = list(zip(train_docs, train_gold))`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`batch_sizes = compounding(4., 128., 1.001)`
			`for i in range(n_iter):`
			`losses = {}`
Update textcat example 2017-10-04 13:12:28 +00:00			`# Progress bar and minibatching`
			`batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)`
			`for batch in batches:`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`docs, golds = zip(*batch)`
Update textcat exampe 2017-10-04 12:55:30 +00:00			`textcat.update(docs, golds, sgd=optimizer, drop=0.2,`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`losses=losses)`
			`with textcat.model.use_params(optimizer.averages):`
			`scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)`
			`yield losses['textcat'], scores`


			`def evaluate(tokenizer, textcat, texts, cats):`
			`docs = (tokenizer(text) for text in texts)`
			`tp = 1e-8 # True positives`
			`fp = 1e-8 # False positives`
			`fn = 1e-8 # False negatives`
			`tn = 1e-8 # True negatives`
			`for i, doc in enumerate(textcat.pipe(docs)):`
			`gold = cats[i]`
			`for label, score in doc.cats.items():`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-05 23:43:02 +00:00			`if label not in gold:`
			`continue`
			`if score >= 0.5 and gold[label] >= 0.5:`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`tp += 1.`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-05 23:43:02 +00:00			`elif score >= 0.5 and gold[label] < 0.5:`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`fp += 1.`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-05 23:43:02 +00:00			`elif score < 0.5 and gold[label] < 0.5:`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`tn += 1`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-05 23:43:02 +00:00			`elif score < 0.5 and gold[label] >= 0.5:`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`fn += 1`
			`precis = tp / (tp + fp)`
			`recall = tp / (tp + fn)`
			`fscore = 2 * (precis * recall) / (precis + recall)`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-05 23:43:02 +00:00			`return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}`
Add example for training text classifier 2017-07-22 18:15:32 +00:00

Update textcat example 2017-10-04 13:12:28 +00:00			`def load_data(limit=0):`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`# Partition off part of the train data --- avoid running experiments`
			`# against test.`
			`train_data, _ = thinc.extra.datasets.imdb()`

			`random.shuffle(train_data)`
Update textcat example 2017-10-04 13:12:28 +00:00			`train_data = train_data[-limit:]`
Add example for training text classifier 2017-07-22 18:15:32 +00:00
			`texts, labels = zip(*train_data)`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-05 23:43:02 +00:00			`cats = [{'POSITIVE': bool(y)} for y in labels]`
Clarify train textcat example 2017-07-29 19:59:27 +00:00
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`split = int(len(train_data) * 0.8)`

			`train_texts = texts[:split]`
			`train_cats = cats[:split]`
			`dev_texts = texts[split:]`
			`dev_cats = cats[split:]`
			`return (train_texts, train_cats), (dev_texts, dev_cats)`


Finish text classifier example 2017-07-22 22:34:12 +00:00			`def main(model_loc=None):`
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`nlp = spacy.lang.en.English()`
			`tokenizer = nlp.tokenizer`
			`textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])`

			`print("Load IMDB data")`
Fix multi-label support for text classification The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map to floats, with 1. denoting 'present' and 0. denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change; although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly. 2017-10-05 23:43:02 +00:00			`(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)`
Finish text classifier example 2017-07-22 22:34:12 +00:00
Add example for training text classifier 2017-07-22 18:15:32 +00:00			`print("Itn.\tLoss\tP\tR\tF")`
			`progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'`

			`for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,`
			`train_texts, train_cats,`
			`dev_texts, dev_cats, n_iter=20)):`
			`print(progress.format(i=i, loss=loss, **scores))`
Finish text classifier example 2017-07-22 22:34:12 +00:00			`# How to save, load and use`
			`nlp.pipeline.append(textcat)`
			`if model_loc is not None:`
			`nlp.to_disk(model_loc)`

			`nlp = spacy.load(model_loc)`
			`doc = nlp(u'This movie sucked!')`
			`print(doc.cats)`

Add example for training text classifier 2017-07-22 18:15:32 +00:00
			`if __name__ == '__main__':`
			`plac.call(main)`