From 72f7f4e68a5076a87dd9402812bfb72e479237ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 30 Nov 2021 11:58:59 +0100 Subject: [PATCH] morphologizer: avoid recreating label tuple for each token (#9764) * morphologizer: avoid recreating label tuple for each token The `labels` property converts the dictionary key set to a tuple. This property was used for every annotated token, recreating the tuple over and over again. Construct the tuple once in the set_annotations function and reuse it. On a Finnish pipeline that I was experimenting with, this results in a speedup of ~15% (~13000 -> ~15000 WPS). * tagger: avoid recreating label tuple for each token --- spacy/pipeline/morphologizer.pyx | 3 ++- spacy/pipeline/tagger.pyx | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index db425b69a..73d3799b1 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -231,12 +231,13 @@ class Morphologizer(Tagger): cdef Vocab vocab = self.vocab cdef bint overwrite = self.cfg["overwrite"] cdef bint extend = self.cfg["extend"] + labels = self.labels for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): - morph = self.labels[tag_id] + morph = labels[tag_id] # set morph if doc.c[j].morph == 0 or overwrite or extend: if overwrite and extend: diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index a9cbac37a..c0768dfec 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -166,13 +166,14 @@ class Tagger(TrainablePipe): cdef Doc doc cdef Vocab vocab = self.vocab cdef bint overwrite = self.cfg["overwrite"] + labels = self.labels for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): if doc.c[j].tag == 0 or overwrite: - doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] + doc.c[j].tag = self.vocab.strings[labels[tag_id]] def update(self, examples, *, drop=0., sgd=None, losses=None): """Learn from a batch of documents and gold-standard information,