Warning goldparse (#4851)

* label in span not writable anymore

* Revert "label in span not writable anymore"

This reverts commit ab442338c8.

* provide a friendlier error message when a file cannot be parsed
This commit is contained in:
Sofie Van Landeghem 2020-01-01 13:16:48 +01:00 committed by Ines Montani
parent 83e0a6f3e3
commit 581eeed98b
2 changed files with 17 additions and 8 deletions

View File

@ -534,6 +534,7 @@ class Errors(object):
"make sure the gold EL data refers to valid results of the " "make sure the gold EL data refers to valid results of the "
"named entity recognizer in the `nlp` pipeline.") "named entity recognizer in the `nlp` pipeline.")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E996 = ("Could not parse {file}: {msg}")
E997 = ("Tokenizer special cases are not allowed to modify the text. " E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes " "This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.") "'{token_attrs}'.")

View File

@ -194,9 +194,10 @@ class GoldCorpus(object):
i = 0 i = 0
for loc in locs: for loc in locs:
loc = util.ensure_path(loc) loc = util.ensure_path(loc)
if loc.parts[-1].endswith("json"): file_name = loc.parts[-1]
if file_name.endswith("json"):
examples = read_json_file(loc) examples = read_json_file(loc)
elif loc.parts[-1].endswith("jsonl"): elif file_name.endswith("jsonl"):
gold_tuples = srsly.read_jsonl(loc) gold_tuples = srsly.read_jsonl(loc)
first_gold_tuple = next(gold_tuples) first_gold_tuple = next(gold_tuples)
gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
@ -212,17 +213,24 @@ class GoldCorpus(object):
doc = ex_dict.get("text", None) doc = ex_dict.get("text", None)
examples.append(Example.from_dict(ex_dict, doc=doc)) examples.append(Example.from_dict(ex_dict, doc=doc))
elif loc.parts[-1].endswith("msg"): elif file_name.endswith("msg"):
text, ex_dict = srsly.read_msgpack(loc) text, ex_dict = srsly.read_msgpack(loc)
examples = [Example.from_dict(ex_dict, doc=text)] examples = [Example.from_dict(ex_dict, doc=text)]
else: else:
supported = ("json", "jsonl", "msg") supported = ("json", "jsonl", "msg")
raise ValueError(Errors.E124.format(path=loc, formats=supported)) raise ValueError(Errors.E124.format(path=loc, formats=supported))
for example in examples: try:
yield example for example in examples:
i += 1 yield example
if limit and i >= limit: i += 1
return if limit and i >= limit:
return
except KeyError as e:
msg = "Missing key {}".format(e)
raise KeyError(Errors.E996.format(file=file_name, msg=msg))
except UnboundLocalError as e:
msg = "Unexpected document structure"
raise ValueError(Errors.E996.format(file=file_name, msg=msg))
@property @property
def dev_examples(self): def dev_examples(self):