mirror of https://github.com/explosion/spaCy.git
Handle errors while multiprocessing (#8004)
* Handle errors while multiprocessing Handle errors while multiprocessing without hanging. * Return the traceback for errors raised while processing a batch, which can be handled by the top-level error handler * Allow for shortened batches due to custom error handlers that ignore errors and skip documents * Define custom components at a higher level * Also move up custom error handler * Use simpler component for test * Switch error type * Adjust test * Only call top-level error handler for exceptions * Register custom test components within tests Use global functions (so they can be pickled) but register the components only within the individual tests.
This commit is contained in:
parent
8a2602051c
commit
b120fb3511
|
@ -490,6 +490,7 @@ class Errors:
|
|||
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
||||
|
||||
# New errors added in v3.x
|
||||
E871 = ("Error encountered in nlp.pipe with multiprocessing:\n\n{error}")
|
||||
E872 = ("Unable to copy tokenizer from base model due to different "
|
||||
'tokenizer settings: current tokenizer config "{curr_config}" '
|
||||
'vs. base model "{base_config}"')
|
||||
|
|
|
@ -13,6 +13,7 @@ import srsly
|
|||
import multiprocessing as mp
|
||||
from itertools import chain, cycle
|
||||
from timeit import default_timer as timer
|
||||
import traceback
|
||||
|
||||
from .tokens.underscore import Underscore
|
||||
from .vocab import Vocab, create_vocab
|
||||
|
@ -1521,11 +1522,15 @@ class Language:
|
|||
|
||||
# Cycle channels not to break the order of docs.
|
||||
# The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
|
||||
byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
|
||||
docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
|
||||
byte_tuples = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
|
||||
try:
|
||||
for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
|
||||
yield doc
|
||||
for i, (_, (byte_doc, byte_error)) in enumerate(zip(raw_texts, byte_tuples), 1):
|
||||
if byte_doc is not None:
|
||||
doc = Doc(self.vocab).from_bytes(byte_doc)
|
||||
yield doc
|
||||
elif byte_error is not None:
|
||||
error = srsly.msgpack_loads(byte_error)
|
||||
self.default_error_handler(None, None, None, ValueError(Errors.E871.format(error=error)))
|
||||
if i % batch_size == 0:
|
||||
# tell `sender` that one batch was consumed.
|
||||
sender.step()
|
||||
|
@ -2019,12 +2024,19 @@ def _apply_pipes(
|
|||
"""
|
||||
Underscore.load_state(underscore_state)
|
||||
while True:
|
||||
texts = receiver.get()
|
||||
docs = (make_doc(text) for text in texts)
|
||||
for pipe in pipes:
|
||||
docs = pipe(docs)
|
||||
# Connection does not accept unpickable objects, so send list.
|
||||
sender.send([doc.to_bytes() for doc in docs])
|
||||
try:
|
||||
texts = receiver.get()
|
||||
docs = (make_doc(text) for text in texts)
|
||||
for pipe in pipes:
|
||||
docs = pipe(docs)
|
||||
# Connection does not accept unpickable objects, so send list.
|
||||
byte_docs = [(doc.to_bytes(), None) for doc in docs]
|
||||
padding = [(None, None)] * (len(texts) - len(byte_docs))
|
||||
sender.send(byte_docs + padding)
|
||||
except Exception:
|
||||
error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))]
|
||||
padding = [(None, None)] * (len(texts) - 1)
|
||||
sender.send(error_msg + padding)
|
||||
|
||||
|
||||
class _Sender:
|
||||
|
|
|
@ -8,13 +8,36 @@ from spacy.vocab import Vocab
|
|||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.de import German
|
||||
from spacy.util import registry, ignore_error, raise_error
|
||||
from spacy.util import registry, ignore_error, raise_error, logger
|
||||
import spacy
|
||||
from thinc.api import NumpyOps, get_current_ops
|
||||
|
||||
from .util import add_vecs_to_vocab, assert_docs_equal
|
||||
|
||||
|
||||
def evil_component(doc):
|
||||
if "2" in doc.text:
|
||||
raise ValueError("no dice")
|
||||
return doc
|
||||
|
||||
|
||||
def perhaps_set_sentences(doc):
|
||||
if not doc.text.startswith("4"):
|
||||
doc[-1].is_sent_start = True
|
||||
return doc
|
||||
|
||||
|
||||
def assert_sents_error(doc):
|
||||
if not doc.has_annotation("SENT_START"):
|
||||
raise ValueError("no sents")
|
||||
return doc
|
||||
|
||||
|
||||
def warn_error(proc_name, proc, docs, e):
|
||||
logger = logging.getLogger("spacy")
|
||||
logger.warning(f"Trouble with component {proc_name}.")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def nlp():
|
||||
nlp = Language(Vocab())
|
||||
|
@ -93,19 +116,16 @@ def test_evaluate_no_pipe(nlp):
|
|||
nlp.evaluate([Example.from_dict(doc, annots)])
|
||||
|
||||
|
||||
@Language.component("test_language_vector_modification_pipe")
|
||||
def vector_modification_pipe(doc):
|
||||
doc.vector += 1
|
||||
return doc
|
||||
|
||||
|
||||
@Language.component("test_language_userdata_pipe")
|
||||
def userdata_pipe(doc):
|
||||
doc.user_data["foo"] = "bar"
|
||||
return doc
|
||||
|
||||
|
||||
@Language.component("test_language_ner_pipe")
|
||||
def ner_pipe(doc):
|
||||
span = Span(doc, 0, 1, label="FIRST")
|
||||
doc.ents += (span,)
|
||||
|
@ -123,6 +143,9 @@ def sample_vectors():
|
|||
|
||||
@pytest.fixture
|
||||
def nlp2(nlp, sample_vectors):
|
||||
Language.component("test_language_vector_modification_pipe", func=vector_modification_pipe)
|
||||
Language.component("test_language_userdata_pipe", func=userdata_pipe)
|
||||
Language.component("test_language_ner_pipe", func=ner_pipe)
|
||||
add_vecs_to_vocab(nlp.vocab, sample_vectors)
|
||||
nlp.add_pipe("test_language_vector_modification_pipe")
|
||||
nlp.add_pipe("test_language_ner_pipe")
|
||||
|
@ -168,82 +191,115 @@ def test_language_pipe_stream(nlp2, n_process, texts):
|
|||
assert_docs_equal(doc, expected_doc)
|
||||
|
||||
|
||||
def test_language_pipe_error_handler():
|
||||
@pytest.mark.parametrize("n_process", [1, 2])
|
||||
def test_language_pipe_error_handler(n_process):
|
||||
"""Test that the error handling of nlp.pipe works well"""
|
||||
nlp = English()
|
||||
nlp.add_pipe("merge_subtokens")
|
||||
nlp.initialize()
|
||||
texts = ["Curious to see what will happen to this text.", "And this one."]
|
||||
# the pipeline fails because there's no parser
|
||||
with pytest.raises(ValueError):
|
||||
ops = get_current_ops()
|
||||
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||
nlp = English()
|
||||
nlp.add_pipe("merge_subtokens")
|
||||
nlp.initialize()
|
||||
texts = ["Curious to see what will happen to this text.", "And this one."]
|
||||
# the pipeline fails because there's no parser
|
||||
with pytest.raises(ValueError):
|
||||
nlp(texts[0])
|
||||
with pytest.raises(ValueError):
|
||||
list(nlp.pipe(texts, n_process=n_process))
|
||||
nlp.set_error_handler(raise_error)
|
||||
with pytest.raises(ValueError):
|
||||
list(nlp.pipe(texts, n_process=n_process))
|
||||
# set explicitely to ignoring
|
||||
nlp.set_error_handler(ignore_error)
|
||||
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||
assert len(docs) == 0
|
||||
nlp(texts[0])
|
||||
with pytest.raises(ValueError):
|
||||
list(nlp.pipe(texts))
|
||||
nlp.set_error_handler(raise_error)
|
||||
with pytest.raises(ValueError):
|
||||
list(nlp.pipe(texts))
|
||||
# set explicitely to ignoring
|
||||
nlp.set_error_handler(ignore_error)
|
||||
docs = list(nlp.pipe(texts))
|
||||
assert len(docs) == 0
|
||||
nlp(texts[0])
|
||||
|
||||
|
||||
def test_language_pipe_error_handler_custom(en_vocab):
|
||||
@pytest.mark.parametrize("n_process", [1, 2])
|
||||
def test_language_pipe_error_handler_custom(en_vocab, n_process):
|
||||
"""Test the error handling of a custom component that has no pipe method"""
|
||||
Language.component("my_evil_component", func=evil_component)
|
||||
ops = get_current_ops()
|
||||
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||
nlp = English()
|
||||
nlp.add_pipe("my_evil_component")
|
||||
texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"]
|
||||
with pytest.raises(ValueError):
|
||||
# the evil custom component throws an error
|
||||
list(nlp.pipe(texts))
|
||||
|
||||
@Language.component("my_evil_component")
|
||||
def evil_component(doc):
|
||||
if "2" in doc.text:
|
||||
raise ValueError("no dice")
|
||||
return doc
|
||||
|
||||
def warn_error(proc_name, proc, docs, e):
|
||||
from spacy.util import logger
|
||||
|
||||
logger.warning(f"Trouble with component {proc_name}.")
|
||||
|
||||
nlp = English()
|
||||
nlp.add_pipe("my_evil_component")
|
||||
nlp.initialize()
|
||||
texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"]
|
||||
with pytest.raises(ValueError):
|
||||
# the evil custom component throws an error
|
||||
list(nlp.pipe(texts))
|
||||
|
||||
nlp.set_error_handler(warn_error)
|
||||
logger = logging.getLogger("spacy")
|
||||
with mock.patch.object(logger, "warning") as mock_warning:
|
||||
# the errors by the evil custom component raise a warning for each bad batch
|
||||
docs = list(nlp.pipe(texts))
|
||||
mock_warning.assert_called()
|
||||
assert mock_warning.call_count == 2
|
||||
assert len(docs) + mock_warning.call_count == len(texts)
|
||||
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
|
||||
nlp.set_error_handler(warn_error)
|
||||
logger = logging.getLogger("spacy")
|
||||
with mock.patch.object(logger, "warning") as mock_warning:
|
||||
# the errors by the evil custom component raise a warning for each
|
||||
# bad doc
|
||||
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||
# HACK/TODO? the warnings in child processes don't seem to be
|
||||
# detected by the mock logger
|
||||
if n_process == 1:
|
||||
mock_warning.assert_called()
|
||||
assert mock_warning.call_count == 2
|
||||
assert len(docs) + mock_warning.call_count == len(texts)
|
||||
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
|
||||
|
||||
|
||||
def test_language_pipe_error_handler_pipe(en_vocab):
|
||||
@pytest.mark.parametrize("n_process", [1, 2])
|
||||
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
|
||||
"""Test the error handling of a component's pipe method"""
|
||||
Language.component("my_perhaps_sentences", func=perhaps_set_sentences)
|
||||
Language.component("assert_sents_error", func=assert_sents_error)
|
||||
ops = get_current_ops()
|
||||
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||
texts = [f"{str(i)} is enough. Done" for i in range(100)]
|
||||
nlp = English()
|
||||
nlp.add_pipe("my_perhaps_sentences")
|
||||
nlp.add_pipe("assert_sents_error")
|
||||
nlp.initialize()
|
||||
with pytest.raises(ValueError):
|
||||
# assert_sents_error requires sentence boundaries, will throw an error otherwise
|
||||
docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
|
||||
nlp.set_error_handler(ignore_error)
|
||||
docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
|
||||
# we lose/ignore the failing 4,40-49 docs
|
||||
assert len(docs) == 89
|
||||
|
||||
@Language.component("my_sentences")
|
||||
def perhaps_set_sentences(doc):
|
||||
if not doc.text.startswith("4"):
|
||||
doc[-1].is_sent_start = True
|
||||
return doc
|
||||
|
||||
texts = [f"{str(i)} is enough. Done" for i in range(100)]
|
||||
nlp = English()
|
||||
nlp.add_pipe("my_sentences")
|
||||
entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 3})
|
||||
entity_linker.kb.add_entity(entity="Q1", freq=12, entity_vector=[1, 2, 3])
|
||||
nlp.initialize()
|
||||
with pytest.raises(ValueError):
|
||||
# the entity linker requires sentence boundaries, will throw an error otherwise
|
||||
docs = list(nlp.pipe(texts, batch_size=10))
|
||||
nlp.set_error_handler(ignore_error)
|
||||
docs = list(nlp.pipe(texts, batch_size=10))
|
||||
# we lose/ignore the failing 0-9 and 40-49 batches
|
||||
assert len(docs) == 80
|
||||
@pytest.mark.parametrize("n_process", [1, 2])
|
||||
def test_language_pipe_error_handler_make_doc_actual(n_process):
|
||||
"""Test the error handling for make_doc"""
|
||||
# TODO: fix so that the following test is the actual behavior
|
||||
|
||||
ops = get_current_ops()
|
||||
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||
nlp = English()
|
||||
nlp.max_length = 10
|
||||
texts = ["12345678901234567890", "12345"] * 10
|
||||
with pytest.raises(ValueError):
|
||||
list(nlp.pipe(texts, n_process=n_process))
|
||||
nlp.default_error_handler = ignore_error
|
||||
if n_process == 1:
|
||||
with pytest.raises(ValueError):
|
||||
list(nlp.pipe(texts, n_process=n_process))
|
||||
else:
|
||||
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||
assert len(docs) == 0
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize("n_process", [1, 2])
|
||||
def test_language_pipe_error_handler_make_doc_preferred(n_process):
|
||||
"""Test the error handling for make_doc"""
|
||||
|
||||
ops = get_current_ops()
|
||||
if isinstance(ops, NumpyOps) or n_process < 2:
|
||||
nlp = English()
|
||||
nlp.max_length = 10
|
||||
texts = ["12345678901234567890", "12345"] * 10
|
||||
with pytest.raises(ValueError):
|
||||
list(nlp.pipe(texts, n_process=n_process))
|
||||
nlp.default_error_handler = ignore_error
|
||||
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||
assert len(docs) == 0
|
||||
|
||||
|
||||
def test_language_from_config_before_after_init():
|
||||
|
|
Loading…
Reference in New Issue