Add NER map option to convert CLI (#4763)

Instead of a hard-coded NER tag simplification function that was only
intended for NorNE, map NER tags in CoNLL-U converter using a dict
provided as JSON as a command-line option.

Map NER entity types to a new tag, or to "" to produce 'O', e.g.:

```
{"PER": "PERSON", "BAD": ""}

=>

B-PER -> B-PERSON
B-BAD -> O
```
This commit is contained in:
adrianeboyd 2019-12-11 18:20:49 +01:00 committed by Ines Montani
parent 68f711b409
commit eb9b1858c4
3 changed files with 30 additions and 17 deletions

View File

@ -39,6 +39,7 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool),
ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path),
)
def convert(
input_file,
@ -49,6 +50,7 @@ def convert(
model=None,
morphology=False,
converter="auto",
ner_map_path=None,
lang=None,
):
"""
@ -94,6 +96,9 @@ def convert(
)
if converter not in CONVERTERS:
msg.fail("Can't find converter for {}".format(converter), exits=1)
ner_map = None
if ner_map_path is not None:
ner_map = srsly.read_json(ner_map_path)
# Use converter function to convert data
func = CONVERTERS[converter]
data = func(
@ -104,6 +109,7 @@ def convert(
lang=lang,
model=model,
no_print=no_print,
ner_map=ner_map,
)
if output_dir != "-":
# Export data to a file

View File

@ -7,7 +7,8 @@ from spacy.gold import Example
from ...gold import iob_to_biluo
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None,
ner_map=None, **_):
"""
Convert conllu files into JSON format for use with train cli.
use_morphology parameter enables appending morphology to tags, which is
@ -33,7 +34,8 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
checked_for_ner = True
raw += example.text
sentences.append(generate_sentence(example.token_annotation,
has_ner_tags, MISC_NER_PATTERN))
has_ner_tags, MISC_NER_PATTERN,
ner_map=ner_map))
# Real-sized documents could be extracted using the comments on the
# conllu document
if len(sentences) % n_sents == 0:
@ -111,8 +113,12 @@ def read_conllx(input_data, use_morphology=False, n=0):
break
def simplify_tags(iob, tag_pattern):
def extract_tags(iob, tag_pattern, ner_map=None):
"""
Extract tag from MISC column according to `tag_pattern` and map to final
entity type with `ner_map` if mapping present.
For NorNE:
Simplify tags obtained from the dataset in order to follow Wikipedia
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
@ -126,22 +132,24 @@ def simplify_tags(iob, tag_pattern):
prefix = tag_match.group(2)
suffix = tag_match.group(3)
if prefix and suffix:
if suffix == "GPE_LOC":
suffix = "LOC"
elif suffix == "GPE_ORG":
suffix = "ORG"
elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
suffix = "MISC"
new_tag = prefix + "-" + suffix
if ner_map:
suffix = ner_map.get(suffix, suffix)
if suffix == "":
new_tag = "O"
else:
new_tag = prefix + "-" + suffix
new_iob.append(new_tag)
return new_iob
def generate_sentence(token_annotation, has_ner_tags, tag_pattern):
def generate_sentence(token_annotation, has_ner_tags, tag_pattern,
ner_map=None):
sentence = {}
tokens = []
if has_ner_tags:
iob = simplify_tags(token_annotation.entities, tag_pattern)
iob = extract_tags(token_annotation.entities, tag_pattern,
ner_map=ner_map)
biluo = iob_to_biluo(iob)
for i, id in enumerate(token_annotation.ids):
token = {}

View File

@ -9,7 +9,7 @@ from spacy.cli.pretrain import make_docs
def test_cli_converters_conllu2json():
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
lines = [
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
@ -32,17 +32,16 @@ def test_cli_converters_conllu2json():
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
def test_cli_converters_conllu2json():
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
def test_cli_converters_conllu2json_name_ner_map():
lines = [
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
"3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
]
input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1)
converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
assert len(converted) == 1
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
@ -55,7 +54,7 @@ def test_cli_converters_conllu2json():
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"]
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_iob2json():