From 2e12dec76e8518ead06def2ab04f0e893957fa13 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 24 Mar 2015 04:26:37 +0100 Subject: [PATCH] * Adjust scorer to account for tokenization mistakes --- spacy/scorer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index 686584cbb..2b03d30a0 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -9,11 +9,13 @@ class Scorer(object): self.ents_fp = 0 self.ents_fn = 0 self.total = 1e-100 + self.mistokened = 0 + self.n_tokens = 0 self.eval_punct = eval_punct @property def tags_acc(self): - return (self.tags_corr / self.total) * 100 + return ((self.tags_corr - self.mistokened) / (self.n_tokens - self.mistokened)) * 100 @property def uas(self): @@ -39,12 +41,15 @@ class Scorer(object): assert len(tokens) == len(gold) for i, token in enumerate(tokens): + if gold.orths.get(token.idx) != token.orth_: + self.mistokened += 1 if not self.skip_token(i, token, gold): self.total += 1 if token.head.i == gold.heads[i]: self.heads_corr += 1 self.labels_corr += token.dep_ == gold.labels[i] - self.tags_corr += token.tag_ == gold.tags[i] + self.tags_corr += token.tag_ == gold.tags[i] + self.n_tokens += 1 gold_ents = set((start, end, label) for (start, end, label) in gold.ents) guess_ents = set(tokens.ents) if verbose and gold_ents: