diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index 6018827a4..4d07ed26a 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -21,7 +21,6 @@ import thinc.neural._classes.layernorm
 thinc.neural._classes.layernorm.set_compat_six_eight(False)
 
 
-
 def train_textcat(tokenizer, textcat,
                   train_texts, train_cats, dev_texts, dev_cats,
                   n_iter=20):
@@ -57,18 +56,20 @@ def evaluate(tokenizer, textcat, texts, cats):
     for i, doc in enumerate(textcat.pipe(docs)):
         gold = cats[i]
         for label, score in doc.cats.items():
-            if score >= 0.5 and label in gold:
+            if label not in gold:
+                continue
+            if score >= 0.5 and gold[label] >= 0.5:
                 tp += 1.
-            elif score >= 0.5 and label not in gold:
+            elif score >= 0.5 and gold[label] < 0.5:
                 fp += 1.
-            elif score < 0.5 and label not in gold:
+            elif score < 0.5 and gold[label] < 0.5:
                 tn += 1
-            if score < 0.5 and label in gold:
+            elif score < 0.5 and gold[label] >= 0.5:
                 fn += 1
     precis = tp / (tp + fp)
     recall = tp / (tp + fn)
     fscore = 2 * (precis * recall) / (precis + recall)
-    return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
+    return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
 
 
 def load_data(limit=0):
@@ -80,7 +81,7 @@ def load_data(limit=0):
         train_data = train_data[-limit:]
 
     texts, labels = zip(*train_data)
-    cats = [(['POSITIVE'] if y else []) for y in labels]
+    cats = [{'POSITIVE': bool(y)} for y in labels]
 
     split = int(len(train_data) * 0.8)
 
@@ -97,7 +98,7 @@ def main(model_loc=None):
     textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
 
     print("Load IMDB data")
-    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=1000)
+    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
 
     print("Itn.\tLoss\tP\tR\tF")
     progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index fc8d6622b..2512c179f 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -387,7 +387,7 @@ cdef class GoldParse:
 
     def __init__(self, doc, annot_tuples=None, words=None, tags=None,
                  heads=None, deps=None, entities=None, make_projective=False,
-                 cats=tuple()):
+                 cats=None):
         """Create a GoldParse.
 
         doc (Doc): The document the annotations refer to.
@@ -398,12 +398,15 @@ cdef class GoldParse:
         entities (iterable): A sequence of named entity annotations, either as
             BILUO tag strings, or as `(start_char, end_char, label)` tuples,
             representing the entity positions.
-        cats (iterable): A sequence of labels for text classification. Each
-            label may be a string or an int, or a `(start_char, end_char, label)`
+        cats (dict): Labels for text classification. Each key in the dictionary
+            may be a string or an int, or a `(start_char, end_char, label)`
             tuple, indicating that the label is applied to only part of the
             document (usually a sentence). Unlike entity annotations, label
             annotations can overlap, i.e. a single word can be covered by
-            multiple labelled spans.
+            multiple labelled spans. The TextCategorizer component expects true
+            examples of a label to have the value 1.0, and negative examples of
+            a label to have the value 0.0. Labels not in the dictionary are
+            treated as missing -- the gradient for those labels will be zero.
         RETURNS (GoldParse): The newly constructed object.
         """
         if words is None:
@@ -434,7 +437,7 @@ cdef class GoldParse:
         self.c.sent_start = self.mem.alloc(len(doc), sizeof(int))
         self.c.ner = self.mem.alloc(len(doc), sizeof(Transition))
 
-        self.cats = list(cats)
+        self.cats = {} if cats is None else dict(cats)
         self.words = [None] * len(doc)
         self.tags = [None] * len(doc)
         self.heads = [None] * len(doc)
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 8d935335c..c39976630 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -551,7 +551,6 @@ class NeuralLabeller(NeuralTagger):
                 label = self.make_label(i, words, tags, heads, deps, ents)
                 if label is not None and label not in self.labels:
                     self.labels[label] = len(self.labels)
-        print(len(self.labels))
         if self.model is True:
             token_vector_width = util.env_opt('token_vector_width')
             self.model = chain(
@@ -720,11 +719,17 @@ class TextCategorizer(BaseThincComponent):
 
     def get_loss(self, docs, golds, scores):
         truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
+        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
         for i, gold in enumerate(golds):
             for j, label in enumerate(self.labels):
-                truths[i, j] = label in gold.cats
+                if label in gold.cats:
+                    truths[i, j] = gold.cats[label]
+                else:
+                    not_missing[i, j] = 0.
         truths = self.model.ops.asarray(truths)
+        not_missing = self.model.ops.asarray(not_missing)
         d_scores = (scores-truths) / scores.shape[0]
+        d_scores *= not_missing
         mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
         return mean_square_error, d_scores