Add NER map option to convert CLI (#4763)

Instead of a hard-coded NER tag simplification function that was only
intended for NorNE, map NER tags in CoNLL-U converter using a dict
provided as JSON as a command-line option.

Map NER entity types to a new tag, or to "" to produce 'O', e.g.:

```
{"PER": "PERSON", "BAD": ""}

=>

B-PER -> B-PERSON
B-BAD -> O
```
This commit is contained in:
adrianeboyd 2019-12-11 18:20:49 +01:00 committed by Ines Montani
parent 68f711b409
commit eb9b1858c4
3 changed files with 30 additions and 17 deletions

View File

@ -39,6 +39,7 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool),
ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path),
)
def convert(
input_file,
@ -49,6 +50,7 @@ def convert(
model=None,
morphology=False,
converter="auto",
ner_map_path=None,
lang=None,
):
"""
@ -94,6 +96,9 @@ def convert(
)
if converter not in CONVERTERS:
msg.fail("Can't find converter for {}".format(converter), exits=1)
ner_map = None
if ner_map_path is not None:
ner_map = srsly.read_json(ner_map_path)
# Use converter function to convert data
func = CONVERTERS[converter]
data = func(
@ -104,6 +109,7 @@ def convert(
lang=lang,
model=model,
no_print=no_print,
ner_map=ner_map,
)
if output_dir != "-":
# Export data to a file

View File

@ -7,7 +7,8 @@ from spacy.gold import Example
from ...gold import iob_to_biluo
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None,
ner_map=None, **_):
"""
Convert conllu files into JSON format for use with train cli.
use_morphology parameter enables appending morphology to tags, which is
@ -33,7 +34,8 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
checked_for_ner = True
raw += example.text
sentences.append(generate_sentence(example.token_annotation,
has_ner_tags, MISC_NER_PATTERN))
has_ner_tags, MISC_NER_PATTERN,
ner_map=ner_map))
# Real-sized documents could be extracted using the comments on the
# conllu document
if len(sentences) % n_sents == 0:
@ -111,8 +113,12 @@ def read_conllx(input_data, use_morphology=False, n=0):
break
def simplify_tags(iob, tag_pattern):
def extract_tags(iob, tag_pattern, ner_map=None):
"""
Extract tag from MISC column according to `tag_pattern` and map to final
entity type with `ner_map` if mapping present.
For NorNE:
Simplify tags obtained from the dataset in order to follow Wikipedia
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
@ -126,22 +132,24 @@ def simplify_tags(iob, tag_pattern):
prefix = tag_match.group(2)
suffix = tag_match.group(3)
if prefix and suffix:
if suffix == "GPE_LOC":
suffix = "LOC"
elif suffix == "GPE_ORG":
suffix = "ORG"
elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
suffix = "MISC"
new_tag = prefix + "-" + suffix
if ner_map:
suffix = ner_map.get(suffix, suffix)
if suffix == "":
new_tag = "O"
else:
new_tag = prefix + "-" + suffix
new_iob.append(new_tag)
return new_iob
def generate_sentence(token_annotation, has_ner_tags, tag_pattern):
def generate_sentence(token_annotation, has_ner_tags, tag_pattern,
ner_map=None):
sentence = {}
tokens = []
if has_ner_tags:
iob = simplify_tags(token_annotation.entities, tag_pattern)
iob = extract_tags(token_annotation.entities, tag_pattern,
ner_map=ner_map)
biluo = iob_to_biluo(iob)
for i, id in enumerate(token_annotation.ids):
token = {}

View File

@ -9,7 +9,7 @@ from spacy.cli.pretrain import make_docs
def test_cli_converters_conllu2json():
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
lines = [
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
@ -32,17 +32,16 @@ def test_cli_converters_conllu2json():
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
def test_cli_converters_conllu2json():
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
def test_cli_converters_conllu2json_name_ner_map():
lines = [
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
"3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
]
input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1)
converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
assert len(converted) == 1
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
@ -55,7 +54,7 @@ def test_cli_converters_conllu2json():
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"]
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_iob2json():