diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index ecdc2ae66..0b2920802 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -26,7 +26,7 @@ def conllu2json(
     Extract NER tags if available and convert them so that they follow
     BILUO and the Wikipedia scheme
     """
-    MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"
+    MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
     msg = Printer(no_print=no_print)
     n_sents_info(msg, n_sents)
     docs = []
@@ -39,7 +39,7 @@ def conllu2json(
         ner_map=ner_map,
         merge_subtokens=merge_subtokens,
     )
-    has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN)
+    has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
     for i, example in enumerate(conll_data):
         raw += example.text
         sentences.append(
@@ -65,21 +65,20 @@ def conllu2json(
 
 def has_ner(input_data, ner_tag_pattern):
     """
-    Check the 10th column of the first token to determine if the file contains
-    NER tags
+    Check the MISC column for NER tags.
     """
     for sent in input_data.strip().split("\n\n"):
         lines = sent.strip().split("\n")
         if lines:
             while lines[0].startswith("#"):
                 lines.pop(0)
-            if lines:
-                parts = lines[0].split("\t")
+            for line in lines:
+                parts = line.split("\t")
                 id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
-                if re.search(ner_tag_pattern, misc):
-                    return True
-                else:
-                    return False
+                for misc_part in misc.split("|"):
+                    if re.match(ner_tag_pattern, misc_part):
+                        return True
+    return False
 
 
 def read_conllx(
@@ -127,19 +126,21 @@ def get_entities(lines, tag_pattern, ner_map=None):
 
     iob = []
     for misc in miscs:
-        tag_match = re.search(tag_pattern, misc)
         iob_tag = "O"
-        if tag_match:
-            prefix = tag_match.group(2)
-            suffix = tag_match.group(3)
-            if prefix and suffix:
-                iob_tag = prefix + "-" + suffix
-                if ner_map:
-                    suffix = ner_map.get(suffix, suffix)
-                    if suffix == "":
-                        iob_tag = "O"
-                    else:
-                        iob_tag = prefix + "-" + suffix
+        for misc_part in misc.split("|"):
+            tag_match = re.match(tag_pattern, misc_part)
+            if tag_match:
+                prefix = tag_match.group(2)
+                suffix = tag_match.group(3)
+                if prefix and suffix:
+                    iob_tag = prefix + "-" + suffix
+                    if ner_map:
+                        suffix = ner_map.get(suffix, suffix)
+                        if suffix == "":
+                            iob_tag = "O"
+                        else:
+                            iob_tag = prefix + "-" + suffix
+                break
         iob.append(iob_tag)
     return iob_to_biluo(iob)
 
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 306adc881..132f7ac9f 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -29,14 +29,26 @@ def test_cli_converters_conllu2json():
     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
 
 
-def test_cli_converters_conllu2json_name_ner_map():
-    lines = [
-        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
-        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
-        "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
-        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
-        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
-    ]
+@pytest.mark.parametrize(
+    "lines",
+    [
+        (
+            "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
+            "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
+            "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
+            "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
+            "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
+        ),
+        (
+            "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\t_",
+            "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|NE=B-PER",
+            "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tNE=L-PER",
+            "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No",
+            "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tNE=B-BAD",
+        ),
+    ],
+)
+def test_cli_converters_conllu2json_name_ner_map(lines):
     input_data = "\n".join(lines)
     converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
     assert len(converted) == 1