diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index fa867fa04..0cc0693a8 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -39,6 +39,7 @@ FILE_TYPES_STDOUT = ("json", "jsonl") converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str), lang=("Language (if tokenizer required)", "option", "l", str), morphology=("Enable appending morphology to tags", "flag", "m", bool), + ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path), ) def convert( input_file, @@ -49,6 +50,7 @@ def convert( model=None, morphology=False, converter="auto", + ner_map_path=None, lang=None, ): """ @@ -94,6 +96,9 @@ def convert( ) if converter not in CONVERTERS: msg.fail("Can't find converter for {}".format(converter), exits=1) + ner_map = None + if ner_map_path is not None: + ner_map = srsly.read_json(ner_map_path) # Use converter function to convert data func = CONVERTERS[converter] data = func( @@ -104,6 +109,7 @@ def convert( lang=lang, model=model, no_print=no_print, + ner_map=ner_map, ) if output_dir != "-": # Export data to a file diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index dc68efef4..0699bb5c1 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -7,7 +7,8 @@ from spacy.gold import Example from ...gold import iob_to_biluo -def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): +def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, + ner_map=None, **_): """ Convert conllu files into JSON format for use with train cli. use_morphology parameter enables appending morphology to tags, which is @@ -33,7 +34,8 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_): checked_for_ner = True raw += example.text sentences.append(generate_sentence(example.token_annotation, - has_ner_tags, MISC_NER_PATTERN)) + has_ner_tags, MISC_NER_PATTERN, + ner_map=ner_map)) # Real-sized documents could be extracted using the comments on the # conllu document if len(sentences) % n_sents == 0: @@ -111,8 +113,12 @@ def read_conllx(input_data, use_morphology=False, n=0): break -def simplify_tags(iob, tag_pattern): +def extract_tags(iob, tag_pattern, ner_map=None): """ + Extract tag from MISC column according to `tag_pattern` and map to final + entity type with `ner_map` if mapping present. + + For NorNE: Simplify tags obtained from the dataset in order to follow Wikipedia scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while 'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to @@ -126,22 +132,24 @@ def simplify_tags(iob, tag_pattern): prefix = tag_match.group(2) suffix = tag_match.group(3) if prefix and suffix: - if suffix == "GPE_LOC": - suffix = "LOC" - elif suffix == "GPE_ORG": - suffix = "ORG" - elif suffix != "PER" and suffix != "LOC" and suffix != "ORG": - suffix = "MISC" new_tag = prefix + "-" + suffix + if ner_map: + suffix = ner_map.get(suffix, suffix) + if suffix == "": + new_tag = "O" + else: + new_tag = prefix + "-" + suffix new_iob.append(new_tag) return new_iob -def generate_sentence(token_annotation, has_ner_tags, tag_pattern): +def generate_sentence(token_annotation, has_ner_tags, tag_pattern, + ner_map=None): sentence = {} tokens = [] if has_ner_tags: - iob = simplify_tags(token_annotation.entities, tag_pattern) + iob = extract_tags(token_annotation.entities, tag_pattern, + ner_map=ner_map) biluo = iob_to_biluo(iob) for i, id in enumerate(token_annotation.ids): token = {} diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 2ce76b9ba..3b75e760a 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -9,7 +9,7 @@ from spacy.cli.pretrain import make_docs def test_cli_converters_conllu2json(): - # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu + # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER", @@ -32,17 +32,16 @@ def test_cli_converters_conllu2json(): assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] -def test_cli_converters_conllu2json(): - # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu +def test_cli_converters_conllu2json_name_ner_map(): lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER", "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER", "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", - "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", + "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD", ] input_data = "\n".join(lines) - converted = conllu2json(input_data, n_sents=1) + converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) assert len(converted) == 1 assert converted[0]["id"] == 0 assert len(converted[0]["paragraphs"]) == 1 @@ -55,7 +54,7 @@ def test_cli_converters_conllu2json(): assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] - assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"] + assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] def test_cli_converters_iob2json():