# spaCy/spacy/tests/util.py

import contextlib
import re
import tempfile

import numpy
import srsly
from thinc.api import get_current_ops

from spacy.tokens import Doc
from spacy.training import split_bilu_label
from spacy.util import make_tempdir  # noqa: F401
from spacy.vocab import Vocab


@contextlib.contextmanager
def make_tempfile(mode="r"):
    """Yield an open temporary file and close it when the block exits.

    mode: Mode string passed through to tempfile.TemporaryFile.
    YIELDS: The open temporary file object.

    The file is closed on exit from the `with` block, including when the
    block raises, so a failing test does not leak the file handle.
    """
    # Delegate cleanup to the file's own context manager so that close()
    # runs even if an exception is thrown into the generator at the yield.
    with tempfile.TemporaryFile(mode=mode) as f:
        yield f


def get_batch(batch_size):
    """Build a list of `batch_size` Docs of increasing length.

    The k-th Doc (counting from 1) has k words. Words are consecutive
    integers rendered as strings, continuing from one Doc to the next, so
    every word is distinct across the batch and easy to trace. All Docs
    share a single, freshly created Vocab.

    batch_size: Number of Docs to create.
    RETURNS: The list of Docs.
    """
    shared_vocab = Vocab()
    batch = []
    offset = 0
    for doc_len in range(1, batch_size + 1):
        stop = offset + doc_len
        # Numeric words keep the tokens unique and their origin obvious.
        words = list(map(str, range(offset, stop)))
        batch.append(Doc(shared_vocab, words=words))
        offset = stop
    return batch


def get_random_doc(n_words):
    """Create a Doc with `n_words` tokens on a fresh Vocab.

    Despite the name, the content is deterministic: the words are the
    strings "0", "1", ..., up to n_words - 1, which makes them easy to track.

    n_words: Number of words in the Doc.
    RETURNS: The new Doc.
    """
    fresh_vocab = Vocab()
    words = list(map(str, range(n_words)))
    return Doc(fresh_vocab, words=words)


def apply_transition_sequence(parser, doc, sequence):
    """Drive the parser through a fixed list of transitions on `doc`.

    parser: Object providing `add_label` and `step_through`.
    doc: The document handed to `parser.step_through`.
    sequence: Iterable of action names. It is iterated twice, so it should
        be re-iterable (e.g. a list).
    """
    # First pass: every hyphenated action name carries a label, which has to
    # be registered with the parser before stepping starts.
    for name in sequence:
        if "-" not in name:
            continue
        _move, label = split_bilu_label(name)
        parser.add_label(label)
    # Second pass: apply the transitions in the given order.
    with parser.step_through(doc) as stepper:
        for name in sequence:
            stepper.transition(name)


def add_vecs_to_vocab(vocab, vectors):
    """Reset the vocab's vectors and fill them from (text, vector) pairs.

    vocab: Object providing `reset_vectors` and `set_vector`.
    vectors: Non-empty sequence of pairs, e.g. [("text", [1, 2, 3])]. All
        vectors need to have the same length; the width is read from the
        first pair.
    RETURNS: The same vocab object that was passed in.
    """
    width = len(vectors[0][1])
    vocab.reset_vectors(width=width)
    for entry in vectors:
        text, values = entry
        vocab.set_vector(text, vector=values)
    return vocab


def get_cosine(vec1, vec2):
    """Compute the cosine of the angle between two vectors.

    Both inputs are first converted to arrays by the current ops and then
    moved to numpy, so the arithmetic itself always runs in numpy. No guard
    is applied for zero-norm vectors.

    vec1: First vector.
    vec2: Second vector.
    RETURNS: dot(vec1, vec2) divided by the product of the two norms.
    """
    ops = get_current_ops()
    left, right = (ops.to_numpy(ops.asarray(vec)) for vec in (vec1, vec2))
    dot_product = numpy.dot(left, right)
    norm_product = numpy.linalg.norm(left) * numpy.linalg.norm(right)
    return dot_product / norm_product


def assert_docs_equal(doc1, doc2):
    """Assert that two Doc objects carry the same annotations.

    Compared per token, in this order: orth, pos, tag, head index, dep,
    is_sent_start, ent_type and ent_iob. Entity spans are then compared
    pairwise on start, end, label and kb_id; the pairing stops at the
    shorter of the two entity sequences.

    doc1: First document.
    doc2: Second document.
    """
    # Token identity and tags.
    for attr in ("orth", "pos", "tag"):
        assert [getattr(t, attr) for t in doc1] == [getattr(t, attr) for t in doc2]

    # Heads are compared by token index rather than by Token object.
    heads1 = [t.head.i for t in doc1]
    heads2 = [t.head.i for t in doc2]
    assert heads1 == heads2

    # Remaining parse attributes, sentence starts and token-level entity
    # annotation.
    for attr in ("dep", "is_sent_start", "ent_type", "ent_iob"):
        assert [getattr(t, attr) for t in doc1] == [getattr(t, attr) for t in doc2]

    # Compare span properties instead of the span objects themselves.
    for ent1, ent2 in zip(doc1.ents, doc2.ents):
        for attr in ("start", "end", "label", "kb_id"):
            assert getattr(ent1, attr) == getattr(ent2, attr)


def assert_packed_msg_equal(b1, b2):
    """Assert that two packed msgpack messages decode to equal mappings.

    Both messages are unpacked with srsly. Their sorted key lists must
    match, and the sorted (key, value) pairs must be equal pair by pair.

    b1: First packed message.
    b2: Second packed message.
    """
    left = srsly.msgpack_loads(b1)
    right = srsly.msgpack_loads(b2)
    assert sorted(left.keys()) == sorted(right.keys())
    left_items = sorted(left.items())
    right_items = sorted(right.items())
    for left_pair, right_pair in zip(left_items, right_items):
        (key1, value1), (key2, value2) = left_pair, right_pair
        assert key1 == key2
        assert value1 == value2


def normalize_whitespace(s):
    """Collapse every run of whitespace in `s` into a single space.

    Leading and trailing runs are collapsed as well, not stripped.

    s: The input string.
    RETURNS: The string with each whitespace run replaced by " ".
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", s)
Add test utils for temp file and temp dir 2017-06-02 08:56:09 +00:00			`import contextlib`
Normalize whitespace in evaluate CLI output test (#12157) * Normalize whitespace in evaluate CLI output test Depending on terminal settings, lines may be padded to the screen width so the comparison is too strict with only the command string replacement. * Move to test util method * Change to normalization method 2023-01-27 15:13:34 +00:00			`import re`
Configure isort to use the Black profile, recursively isort the `spacy` module (#12721) * Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo 2023-06-14 15:48:41 +00:00			`import tempfile`

			`import numpy`
💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003) Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉 See here: https://github.com/explosion/srsly Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place. At the same time, we noticed that having a lot of small dependencies was making maintainence harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel. srsly currently includes forks of the following packages: ujson msgpack msgpack-numpy cloudpickle * WIP: replace json/ujson with srsly * Replace ujson in examples Use regular json instead of srsly to make code easier to read and follow * Update requirements * Fix imports * Fix typos * Replace msgpack with srsly * Fix warning 2018-12-03 00:28:22 +00:00			`import srsly`
Configure isort to use the Black profile, recursively isort the `spacy` module (#12721) * Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo 2023-06-14 15:48:41 +00:00			`from thinc.api import get_current_ops`

Tidy up tests and docs 2020-09-21 18:43:54 +00:00			`from spacy.tokens import Doc`
account for NER labels with a hyphen in the name (#10960) * account for NER labels with a hyphen in the name * cleanup * fix docstring * add return type to helper method * shorter method and few more occurrences * user helper method across repo * fix circular import * partial revert to avoid circular import 2022-06-17 19:02:37 +00:00			`from spacy.training import split_bilu_label`
Configure isort to use the Black profile, recursively isort the `spacy` module (#12721) * Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo 2023-06-14 15:48:41 +00:00			`from spacy.util import make_tempdir # noqa: F401`
			`from spacy.vocab import Vocab`
Train textcat with config (#5143) * bring back default build_text_classifier method * remove _set_dims_ hack in favor of proper dim inference * add tok2vec initialize to unit test * small fixes * add unit test for various textcat config settings * logistic output layer does not have nO * fix window_size setting * proper fix * fix W initialization * Update textcat training example * Use ml_datasets * Convert training data to `Example` format * Use `n_texts` to set proportionate dev size * fix _init renaming on latest thinc * avoid setting a non-existing dim * update to thinc==8.0.0a2 * add BOW and CNN defaults for easy testing * various experiments with train_textcat script, fix softmax activation in textcat bow * allow textcat train script to work on other datasets as well * have dataset as a parameter * train textcat from config, with example config * add config for training textcat * formatting * fix exclusive_classes * fixing BOW for GPU * bump thinc to 8.0.0a3 (not published yet so CI will fail) * add in link_vectors_to_models which got deleted Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> 2020-03-29 17:40:36 +00:00
Add load_test_model function with importorskip() Loads model only if it can be imported, i.e. if it's installed as a package. 2017-05-29 20:11:31 +00:00
Add test utils for temp file and temp dir 2017-06-02 08:56:09 +00:00			`@contextlib.contextmanager`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 00:09:36 +00:00			`def make_tempfile(mode="r"):`
Add test utils for temp file and temp dir 2017-06-02 08:56:09 +00:00			`f = tempfile.TemporaryFile(mode=mode)`
			`yield f`
			`f.close()`


Train textcat with config (#5143) * bring back default build_text_classifier method * remove _set_dims_ hack in favor of proper dim inference * add tok2vec initialize to unit test * small fixes * add unit test for various textcat config settings * logistic output layer does not have nO * fix window_size setting * proper fix * fix W initialization * Update textcat training example * Use ml_datasets * Convert training data to `Example` format * Use `n_texts` to set proportionate dev size * fix _init renaming on latest thinc * avoid setting a non-existing dim * update to thinc==8.0.0a2 * add BOW and CNN defaults for easy testing * various experiments with train_textcat script, fix softmax activation in textcat bow * allow textcat train script to work on other datasets as well * have dataset as a parameter * train textcat from config, with example config * add config for training textcat * formatting * fix exclusive_classes * fixing BOW for GPU * bump thinc to 8.0.0a3 (not published yet so CI will fail) * add in link_vectors_to_models which got deleted Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> 2020-03-29 17:40:36 +00:00			`def get_batch(batch_size):`
			`vocab = Vocab()`
			`docs = []`
			`start = 0`
			`for size in range(1, batch_size + 1):`
			`# Make the words numbers, so that they're distinct`
			`# across the batch, and easy to track.`
			`numbers = [str(i) for i in range(start, start + size)]`
			`docs.append(Doc(vocab, words=numbers))`
			`start += size`
			`return docs`


fix name clash 2020-06-02 20:24:57 +00:00			`def get_random_doc(n_words):`
add test for minibatch util 2020-06-02 16:26:21 +00:00			`vocab = Vocab()`
			`# Make the words numbers, so that they're easy to track.`
			`numbers = [str(i) for i in range(0, n_words)]`
			`return Doc(vocab, words=numbers)`


Add apply_transition_sequence util function to utils 2017-01-11 20:30:14 +00:00			`def apply_transition_sequence(parser, doc, sequence):`
			`"""Perform a series of pre-specified transitions, to put the parser in a`
			`desired state."""`
			`for action_name in sequence:`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 00:09:36 +00:00			`if "-" in action_name:`
account for NER labels with a hyphen in the name (#10960) * account for NER labels with a hyphen in the name * cleanup * fix docstring * add return type to helper method * shorter method and few more occurrences * user helper method across repo * fix circular import * partial revert to avoid circular import 2022-06-17 19:02:37 +00:00			`move, label = split_bilu_label(action_name)`
Add apply_transition_sequence util function to utils 2017-01-11 20:30:14 +00:00			`parser.add_label(label)`
			`with parser.step_through(doc) as stepwise:`
			`for transition in sequence:`
			`stepwise.transition(transition)`
Add get_cosine util function 2017-01-12 15:49:57 +00:00

Add util function to add vectors to vocab 2017-01-13 13:26:30 +00:00			`def add_vecs_to_vocab(vocab, vectors):`
			`"""Add list of vector tuples to given vocab. All vectors need to have the`
			`same length. Format: [("text", [1, 2, 3])]"""`
			`length = len(vectors[0][1])`
Revise and simplify Vectors class 2017-10-31 17:25:08 +00:00			`vocab.reset_vectors(width=length)`
Add util function to add vectors to vocab 2017-01-13 13:26:30 +00:00			`for word, vec in vectors:`
Revise and simplify Vectors class 2017-10-31 17:25:08 +00:00			`vocab.set_vector(word, vector=vec)`
Add util function to add vectors to vocab 2017-01-13 13:26:30 +00:00			`return vocab`


Add get_cosine util function 2017-01-12 15:49:57 +00:00			`def get_cosine(vec1, vec2):`
			`"""Get cosine for two given vectors"""`
Set up GPU CI testing (#7293) * Set up CI for tests with GPU agent * Update tests for enabled GPU * Fix steps filename * Add parallel build jobs as a setting * Fix test requirements * Fix install test requirements condition * Fix pipeline models test * Reset current ops in prefer/require testing * Fix more tests * Remove separate test_models test * Fix regression 5551 * fix StaticVectors for GPU use * fix vocab tests * Fix regression test 5082 * Move azure steps to .github and reenable default pool jobs * Consolidate/rename azure steps Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> 2021-04-22 12:58:29 +00:00			`OPS = get_current_ops()`
			`v1 = OPS.to_numpy(OPS.asarray(vec1))`
			`v2 = OPS.to_numpy(OPS.asarray(vec2))`
			`return numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2))`
Add assert_docs_equal util to compare two docs 2017-01-12 20:56:52 +00:00

			`def assert_docs_equal(doc1, doc2):`
Reformat add_docs_equal and add docstring 2017-01-13 13:25:53 +00:00			`"""Compare two Doc objects and assert that they're equal. Tests for tokens,`
			`tags, dependencies and entities."""`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 00:09:36 +00:00			`assert [t.orth for t in doc1] == [t.orth for t in doc2]`
Add assert_docs_equal util to compare two docs 2017-01-12 20:56:52 +00:00
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 00:09:36 +00:00			`assert [t.pos for t in doc1] == [t.pos for t in doc2]`
			`assert [t.tag for t in doc1] == [t.tag for t in doc2]`
Add assert_docs_equal util to compare two docs 2017-01-12 20:56:52 +00:00
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 00:09:36 +00:00			`assert [t.head.i for t in doc1] == [t.head.i for t in doc2]`
			`assert [t.dep for t in doc1] == [t.dep for t in doc2]`
Fix sents comparison in test util Due to changes to `Span` (#5005), spans from different documents are now never equal. Check `Token.is_sent_start` values instead. 2020-03-13 08:25:23 +00:00			`assert [t.is_sent_start for t in doc1] == [t.is_sent_start for t in doc2]`
Add assert_docs_equal util to compare two docs 2017-01-12 20:56:52 +00:00
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 00:09:36 +00:00			`assert [t.ent_type for t in doc1] == [t.ent_type for t in doc2]`
			`assert [t.ent_iob for t in doc1] == [t.ent_iob for t in doc2]`
Sync Span __eq__ and __hash__ (#5005) * Sync Span __eq__ and __hash__ Use the same tuple for `__eq__` and `__hash__`, including all attributes except `vector` and `vector_norm`. * Update entity comparison in tests Update `assert_docs_equal()` test util to compare `Span` properties for ents rather than `Span` objects. 2020-02-16 16:20:36 +00:00			`for ent1, ent2 in zip(doc1.ents, doc2.ents):`
			`assert ent1.start == ent2.start`
			`assert ent1.end == ent2.end`
			`assert ent1.label == ent2.label`
			`assert ent1.kb_id == ent2.kb_id`
Add assert_packed_msg_equal util function 2017-06-03 15:04:30 +00:00

			`def assert_packed_msg_equal(b1, b2):`
			`"""Assert that two packed msgpack messages are equal."""`
💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003) Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉 See here: https://github.com/explosion/srsly Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place. At the same time, we noticed that having a lot of small dependencies was making maintainence harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel. srsly currently includes forks of the following packages: ujson msgpack msgpack-numpy cloudpickle * WIP: replace json/ujson with srsly * Replace ujson in examples Use regular json instead of srsly to make code easier to read and follow * Update requirements * Fix imports * Fix typos * Replace msgpack with srsly * Fix warning 2018-12-03 00:28:22 +00:00			`msg1 = srsly.msgpack_loads(b1)`
			`msg2 = srsly.msgpack_loads(b2)`
Add assert_packed_msg_equal util function 2017-06-03 15:04:30 +00:00			`assert sorted(msg1.keys()) == sorted(msg2.keys())`
			`for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):`
			`assert k1 == k2`
			`assert v1 == v2`
Normalize whitespace in evaluate CLI output test (#12157) * Normalize whitespace in evaluate CLI output test Depending on terminal settings, lines may be padded to the screen width so the comparison is too strict with only the command string replacement. * Move to test util method * Change to normalization method 2023-01-27 15:13:34 +00:00

			`def normalize_whitespace(s):`
			`return re.sub(r"\s+", " ", s)`