From 72f7f4e68a5076a87dd9402812bfb72e479237ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 30 Nov 2021 11:58:59 +0100
Subject: [PATCH] morphologizer: avoid recreating label tuple for each token
 (#9764)

* morphologizer: avoid recreating label tuple for each token

The `labels` property converts the dictionary key set to a tuple. This
property was used for every annotated token, recreating the tuple over
and over again.

Construct the tuple once in the `set_annotations` method and reuse it
for every token.

On a Finnish pipeline that I was experimenting with, this results in a
speedup of ~15% (~13000 -> ~15000 words per second, WPS).

* tagger: avoid recreating label tuple for each token
---
 spacy/pipeline/morphologizer.pyx | 3 ++-
 spacy/pipeline/tagger.pyx        | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index db425b69a..73d3799b1 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -231,12 +231,13 @@ class Morphologizer(Tagger):
         cdef Vocab vocab = self.vocab
         cdef bint overwrite = self.cfg["overwrite"]
         cdef bint extend = self.cfg["extend"]
+        labels = self.labels
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
-                morph = self.labels[tag_id]
+                morph = labels[tag_id]
                 # set morph
                 if doc.c[j].morph == 0 or overwrite or extend:
                     if overwrite and extend:
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index a9cbac37a..c0768dfec 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -166,13 +166,14 @@ class Tagger(TrainablePipe):
         cdef Doc doc
         cdef Vocab vocab = self.vocab
         cdef bint overwrite = self.cfg["overwrite"]
+        labels = self.labels
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
                 if doc.c[j].tag == 0 or overwrite:
-                    doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
+                    doc.c[j].tag = self.vocab.strings[labels[tag_id]]
 
     def update(self, examples, *, drop=0., sgd=None, losses=None):
         """Learn from a batch of documents and gold-standard information,