morphologizer: avoid recreating label tuple for each token (#9764)

* morphologizer: avoid recreating label tuple for each token

The `labels` property converts the dictionary key set to a tuple. This
property was used for every annotated token, recreating the tuple over
and over again.

Construct the tuple once in the set_annotations function and reuse it.

On a Finnish pipeline that I was experimenting with, this results in a
speedup of ~15% (~13000 -> ~15000 WPS, i.e. words per second).

* tagger: avoid recreating label tuple for each token
This commit is contained in:
Daniël de Kok 2021-11-30 11:58:59 +01:00 committed by GitHub
parent c19f0c1604
commit 72f7f4e68a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 4 additions and 2 deletions

View File

@@ -231,12 +231,13 @@ class Morphologizer(Tagger):
 cdef Vocab vocab = self.vocab
 cdef bint overwrite = self.cfg["overwrite"]
 cdef bint extend = self.cfg["extend"]
+labels = self.labels
 for i, doc in enumerate(docs):
 doc_tag_ids = batch_tag_ids[i]
 if hasattr(doc_tag_ids, "get"):
 doc_tag_ids = doc_tag_ids.get()
 for j, tag_id in enumerate(doc_tag_ids):
-morph = self.labels[tag_id]
+morph = labels[tag_id]
 # set morph
 if doc.c[j].morph == 0 or overwrite or extend:
 if overwrite and extend:

View File

@@ -166,13 +166,14 @@ class Tagger(TrainablePipe):
 cdef Doc doc
 cdef Vocab vocab = self.vocab
 cdef bint overwrite = self.cfg["overwrite"]
+labels = self.labels
 for i, doc in enumerate(docs):
 doc_tag_ids = batch_tag_ids[i]
 if hasattr(doc_tag_ids, "get"):
 doc_tag_ids = doc_tag_ids.get()
 for j, tag_id in enumerate(doc_tag_ids):
 if doc.c[j].tag == 0 or overwrite:
-doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
+doc.c[j].tag = self.vocab.strings[labels[tag_id]]
 def update(self, examples, *, drop=0., sgd=None, losses=None):
 """Learn from a batch of documents and gold-standard information,