mirror of https://github.com/explosion/spaCy.git
Add NER map option to convert CLI (#4763)
Instead of a hard-coded NER tag simplification function that was only intended for NorNE, map NER tags in the CoNLL-U converter using a dict provided as JSON via a command-line option. Each NER entity type is mapped to a new tag, or to "" to produce 'O', e.g.:
```
{"PER": "PERSON", "BAD": ""}
=> B-PER -> B-PERSON
   B-BAD -> O
```
This commit is contained in:
parent
68f711b409
commit
eb9b1858c4
|
@ -39,6 +39,7 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
|
|||
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
|
||||
lang=("Language (if tokenizer required)", "option", "l", str),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool),
|
||||
ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path),
|
||||
)
|
||||
def convert(
|
||||
input_file,
|
||||
|
@ -49,6 +50,7 @@ def convert(
|
|||
model=None,
|
||||
morphology=False,
|
||||
converter="auto",
|
||||
ner_map_path=None,
|
||||
lang=None,
|
||||
):
|
||||
"""
|
||||
|
@ -94,6 +96,9 @@ def convert(
|
|||
)
|
||||
if converter not in CONVERTERS:
|
||||
msg.fail("Can't find converter for {}".format(converter), exits=1)
|
||||
ner_map = None
|
||||
if ner_map_path is not None:
|
||||
ner_map = srsly.read_json(ner_map_path)
|
||||
# Use converter function to convert data
|
||||
func = CONVERTERS[converter]
|
||||
data = func(
|
||||
|
@ -104,6 +109,7 @@ def convert(
|
|||
lang=lang,
|
||||
model=model,
|
||||
no_print=no_print,
|
||||
ner_map=ner_map,
|
||||
)
|
||||
if output_dir != "-":
|
||||
# Export data to a file
|
||||
|
|
|
@ -7,7 +7,8 @@ from spacy.gold import Example
|
|||
from ...gold import iob_to_biluo
|
||||
|
||||
|
||||
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
|
||||
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None,
|
||||
ner_map=None, **_):
|
||||
"""
|
||||
Convert conllu files into JSON format for use with train cli.
|
||||
use_morphology parameter enables appending morphology to tags, which is
|
||||
|
@ -33,7 +34,8 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
|
|||
checked_for_ner = True
|
||||
raw += example.text
|
||||
sentences.append(generate_sentence(example.token_annotation,
|
||||
has_ner_tags, MISC_NER_PATTERN))
|
||||
has_ner_tags, MISC_NER_PATTERN,
|
||||
ner_map=ner_map))
|
||||
# Real-sized documents could be extracted using the comments on the
|
||||
# conllu document
|
||||
if len(sentences) % n_sents == 0:
|
||||
|
@ -111,8 +113,12 @@ def read_conllx(input_data, use_morphology=False, n=0):
|
|||
break
|
||||
|
||||
|
||||
def simplify_tags(iob, tag_pattern):
|
||||
def extract_tags(iob, tag_pattern, ner_map=None):
|
||||
"""
|
||||
Extract tag from MISC column according to `tag_pattern` and map to final
|
||||
entity type with `ner_map` if mapping present.
|
||||
|
||||
For NorNE:
|
||||
Simplify tags obtained from the dataset in order to follow Wikipedia
|
||||
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
|
||||
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
|
||||
|
@ -126,22 +132,24 @@ def simplify_tags(iob, tag_pattern):
|
|||
prefix = tag_match.group(2)
|
||||
suffix = tag_match.group(3)
|
||||
if prefix and suffix:
|
||||
if suffix == "GPE_LOC":
|
||||
suffix = "LOC"
|
||||
elif suffix == "GPE_ORG":
|
||||
suffix = "ORG"
|
||||
elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
|
||||
suffix = "MISC"
|
||||
new_tag = prefix + "-" + suffix
|
||||
if ner_map:
|
||||
suffix = ner_map.get(suffix, suffix)
|
||||
if suffix == "":
|
||||
new_tag = "O"
|
||||
else:
|
||||
new_tag = prefix + "-" + suffix
|
||||
new_iob.append(new_tag)
|
||||
return new_iob
|
||||
|
||||
|
||||
def generate_sentence(token_annotation, has_ner_tags, tag_pattern):
|
||||
def generate_sentence(token_annotation, has_ner_tags, tag_pattern,
|
||||
ner_map=None):
|
||||
sentence = {}
|
||||
tokens = []
|
||||
if has_ner_tags:
|
||||
iob = simplify_tags(token_annotation.entities, tag_pattern)
|
||||
iob = extract_tags(token_annotation.entities, tag_pattern,
|
||||
ner_map=ner_map)
|
||||
biluo = iob_to_biluo(iob)
|
||||
for i, id in enumerate(token_annotation.ids):
|
||||
token = {}
|
||||
|
|
|
@ -9,7 +9,7 @@ from spacy.cli.pretrain import make_docs
|
|||
|
||||
|
||||
def test_cli_converters_conllu2json():
|
||||
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
|
||||
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
|
||||
lines = [
|
||||
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
|
||||
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
|
||||
|
@ -32,17 +32,16 @@ def test_cli_converters_conllu2json():
|
|||
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
|
||||
|
||||
|
||||
def test_cli_converters_conllu2json():
|
||||
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
|
||||
def test_cli_converters_conllu2json_name_ner_map():
|
||||
lines = [
|
||||
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
|
||||
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
|
||||
"3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
|
||||
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
|
||||
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
|
||||
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
|
||||
]
|
||||
input_data = "\n".join(lines)
|
||||
converted = conllu2json(input_data, n_sents=1)
|
||||
converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
|
||||
assert len(converted) == 1
|
||||
assert converted[0]["id"] == 0
|
||||
assert len(converted[0]["paragraphs"]) == 1
|
||||
|
@ -55,7 +54,7 @@ def test_cli_converters_conllu2json():
|
|||
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
|
||||
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
|
||||
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
|
||||
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"]
|
||||
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
|
||||
|
||||
|
||||
def test_cli_converters_iob2json():
|
||||
|
|
Loading…
Reference in New Issue