mirror of https://github.com/explosion/spaCy.git
Provide debug data info for floret vectors (#10592)
This commit is contained in:
parent
36d3af3013
commit
e3ccc1973b
|
@ -19,6 +19,7 @@ from ..morphology import Morphology
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..util import registry, resolve_dot_names
|
from ..util import registry, resolve_dot_names
|
||||||
from ..compat import Literal
|
from ..compat import Literal
|
||||||
|
from ..vectors import Mode as VectorsMode
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -170,26 +171,34 @@ def debug_data(
|
||||||
show=verbose,
|
show=verbose,
|
||||||
)
|
)
|
||||||
if len(nlp.vocab.vectors):
|
if len(nlp.vocab.vectors):
|
||||||
msg.info(
|
if nlp.vocab.vectors.mode == VectorsMode.floret:
|
||||||
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
|
msg.info(
|
||||||
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
|
f"floret vectors with {len(nlp.vocab.vectors)} vectors, "
|
||||||
)
|
f"{nlp.vocab.vectors_length} dimensions, "
|
||||||
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
f"{nlp.vocab.vectors.minn}-{nlp.vocab.vectors.maxn} char "
|
||||||
msg.warn(
|
f"n-gram subwords"
|
||||||
"{} words in training data without vectors ({:.0f}%)".format(
|
)
|
||||||
n_missing_vectors,
|
else:
|
||||||
100 * (n_missing_vectors / gold_train_data["n_words"]),
|
msg.info(
|
||||||
),
|
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
|
||||||
)
|
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
|
||||||
msg.text(
|
)
|
||||||
"10 most common words without vectors: {}".format(
|
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
||||||
_format_labels(
|
msg.warn(
|
||||||
gold_train_data["words_missing_vectors"].most_common(10),
|
"{} words in training data without vectors ({:.0f}%)".format(
|
||||||
counts=True,
|
n_missing_vectors,
|
||||||
)
|
100 * (n_missing_vectors / gold_train_data["n_words"]),
|
||||||
),
|
),
|
||||||
show=verbose,
|
)
|
||||||
)
|
msg.text(
|
||||||
|
"10 most common words without vectors: {}".format(
|
||||||
|
_format_labels(
|
||||||
|
gold_train_data["words_missing_vectors"].most_common(10),
|
||||||
|
counts=True,
|
||||||
|
)
|
||||||
|
),
|
||||||
|
show=verbose,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
msg.info("No word vectors present in the package")
|
msg.info("No word vectors present in the package")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue