Provide debug data info for floret vectors (#10592)

This commit is contained in:
Adriane Boyd 2022-03-31 15:11:32 +02:00 committed by GitHub
parent 36d3af3013
commit e3ccc1973b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 29 additions and 20 deletions

View File

@@ -19,6 +19,7 @@ from ..morphology import Morphology
from ..language import Language from ..language import Language
from ..util import registry, resolve_dot_names from ..util import registry, resolve_dot_names
from ..compat import Literal from ..compat import Literal
from ..vectors import Mode as VectorsMode
from .. import util from .. import util
@@ -170,26 +171,34 @@ def debug_data(
show=verbose, show=verbose,
) )
if len(nlp.vocab.vectors): if len(nlp.vocab.vectors):
msg.info( if nlp.vocab.vectors.mode == VectorsMode.floret:
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} " msg.info(
f"unique keys, {nlp.vocab.vectors_length} dimensions)" f"floret vectors with {len(nlp.vocab.vectors)} vectors, "
) f"{nlp.vocab.vectors_length} dimensions, "
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values()) f"{nlp.vocab.vectors.minn}-{nlp.vocab.vectors.maxn} char "
msg.warn( f"n-gram subwords"
"{} words in training data without vectors ({:.0f}%)".format( )
n_missing_vectors, else:
100 * (n_missing_vectors / gold_train_data["n_words"]), msg.info(
), f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
) f"unique keys, {nlp.vocab.vectors_length} dimensions)"
msg.text( )
"10 most common words without vectors: {}".format( n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
_format_labels( msg.warn(
gold_train_data["words_missing_vectors"].most_common(10), "{} words in training data without vectors ({:.0f}%)".format(
counts=True, n_missing_vectors,
) 100 * (n_missing_vectors / gold_train_data["n_words"]),
), ),
show=verbose, )
) msg.text(
"10 most common words without vectors: {}".format(
_format_labels(
gold_train_data["words_missing_vectors"].most_common(10),
counts=True,
)
),
show=verbose,
)
else: else:
msg.info("No word vectors present in the package") msg.info("No word vectors present in the package")