diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index c0fd58fb0..7fa491b9d 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -18,21 +18,28 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
     """
     # by @dvsrepo, via #11 explosion/spacy-dev-resources
     # by @katarkor
+    # name=NER is to handle NorNE
+    MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"
     docs = []
+    raw = ""
     sentences = []
     conll_data = read_conllx(input_data, use_morphology=use_morphology)
     checked_for_ner = False
     has_ner_tags = False
     for i, example in enumerate(conll_data):
         if not checked_for_ner:
-            has_ner_tags = is_ner(example.token_annotation.entities[0])
+            has_ner_tags = is_ner(example.token_annotation.entities[0],
+                                  MISC_NER_PATTERN)
             checked_for_ner = True
-        sentences.append(generate_sentence(example.token_annotation, has_ner_tags))
+        raw += example.text
+        sentences.append(generate_sentence(example.token_annotation,
+                                           has_ner_tags, MISC_NER_PATTERN))
         # Real-sized documents could be extracted using the comments on the
         # conllu document
         if len(sentences) % n_sents == 0:
-            doc = create_doc(sentences, i)
+            doc = create_doc(raw, sentences, i)
             docs.append(doc)
+            raw = ""
             sentences = []
     if sentences:
-        doc = create_doc(sentences, i)
+        doc = create_doc(raw, sentences, i)
@@ -40,12 +47,12 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
     return docs
 
 
-def is_ner(tag):
+def is_ner(tag, tag_pattern):
     """
     Check the 10th column of the first token to determine if the file contains
     NER tags
     """
-    tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
+    tag_match = re.search(tag_pattern, tag)
     if tag_match:
         return True
     elif tag == "O":
@@ -63,9 +70,10 @@ def read_conllx(input_data, use_morphology=False, n=0):
             while lines[0].startswith("#"):
                 lines.pop(0)
             ids, words, tags, heads, deps, ents = [], [], [], [], [], []
+            spaces = []
             for line in lines:
                 parts = line.split("\t")
-                id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
+                id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
                 if "-" in id_ or "." in id_:
                     continue
                 try:
@@ -74,18 +82,27 @@
                     dep = "ROOT" if dep == "root" else dep
                     tag = pos if tag == "_" else tag
                     tag = tag + "__" + morph if use_morphology else tag
-                    iob = iob if iob else "O"
+                    ent = misc if misc else "O"
                     ids.append(id_)
                     words.append(word)
                     tags.append(tag)
                     heads.append(head)
                     deps.append(dep)
-                    ents.append(iob)
+                    ents.append(ent)
+                    if "SpaceAfter=No" in misc:
+                        spaces.append(False)
+                    else:
+                        spaces.append(True)
                 except:  # noqa: E722
                     print(line)
                     raise
 
-            example = Example(doc=None)
+            raw = ""
+            for word, space in zip(words, spaces):
+                raw += word
+                if space:
+                    raw += " "
+            example = Example(doc=raw)
             example.set_token_annotation(ids=ids, words=words, tags=tags,
                                          heads=heads, deps=deps, entities=ents)
             yield example
@@ -94,7 +111,7 @@ def read_conllx(input_data, use_morphology=False, n=0):
             break
 
 
-def simplify_tags(iob):
+def simplify_tags(iob, tag_pattern):
     """
     Simplify tags obtained from the dataset in order to follow Wikipedia
     scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
@@ -103,26 +120,28 @@ def simplify_tags(iob):
     """
     new_iob = []
     for tag in iob:
-        tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
+        tag_match = re.search(tag_pattern, tag)
+        new_tag = "O"
         if tag_match:
-            prefix = tag_match.group(1)
-            suffix = tag_match.group(2)
-            if suffix == "GPE_LOC":
-                suffix = "LOC"
-            elif suffix == "GPE_ORG":
-                suffix = "ORG"
-            elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
-                suffix = "MISC"
-            tag = prefix + "-" + suffix
-            new_iob.append(tag)
+            prefix = tag_match.group(2)
+            suffix = tag_match.group(3)
+            if prefix and suffix:
+                if suffix == "GPE_LOC":
+                    suffix = "LOC"
+                elif suffix == "GPE_ORG":
+                    suffix = "ORG"
+                elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
+                    suffix = "MISC"
+                new_tag = prefix + "-" + suffix
+        new_iob.append(new_tag)
     return new_iob
 
 
-def generate_sentence(token_annotation, has_ner_tags):
+def generate_sentence(token_annotation, has_ner_tags, tag_pattern):
     sentence = {}
     tokens = []
     if has_ner_tags:
-        iob = simplify_tags(token_annotation.entities)
+        iob = simplify_tags(token_annotation.entities, tag_pattern)
         biluo = iob_to_biluo(iob)
     for i, id in enumerate(token_annotation.ids):
         token = {}
@@ -138,11 +157,12 @@ def generate_sentence(token_annotation, has_ner_tags):
     return sentence
 
 
-def create_doc(sentences, id):
+def create_doc(raw, sentences, id):
     doc = {}
     paragraph = {}
     doc["id"] = id
     doc["paragraphs"] = []
+    paragraph["raw"] = raw.strip()
     paragraph["sentences"] = sentences
     doc["paragraphs"].append(paragraph)
     return doc
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 6dce649a9..2ce76b9ba 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -32,6 +32,32 @@ def test_cli_converters_conllu2json():
     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
 
 
+def test_cli_converters_conllu2json():
+    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
+    lines = [
+        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
+        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
+        "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
+        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
+        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
+    ]
+    input_data = "\n".join(lines)
+    converted = conllu2json(input_data, n_sents=1)
+    assert len(converted) == 1
+    assert converted[0]["id"] == 0
+    assert len(converted[0]["paragraphs"]) == 1
+    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår."
+    assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
+    sent = converted[0]["paragraphs"][0]["sentences"][0]
+    assert len(sent["tokens"]) == 5
+    tokens = sent["tokens"]
+    assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår", "."]
+    assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
+    assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
+    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
+    assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"]
+
+
 def test_cli_converters_iob2json():
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -106,7 +132,6 @@ def test_cli_converters_conll_ner2json():
     ]
     input_data = "\n".join(lines)
     converted = conll_ner2json(input_data, n_sents=10)
-    print(converted)
     assert len(converted) == 1
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1
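The patch hinges on MISC_NER_PATTERN, so here is a minimal standalone sketch (not part of the patch) of what that regex is expected to extract from the last CoNLL-U column. It runs the same pattern against a bare IOB tag ("B-PER") and against NorNE-style MISC values ("name=O", "SpaceAfter=No|name=B-PER"): group 1 is the full tag, groups 2 and 3 are the IOB prefix and entity type. In the converter, is_ner() just checks for a truthy match, while simplify_tags() reads groups 2 and 3 and falls back to "O" when they are absent.

    import re

    # Same pattern as in the patch above, written as a raw string here for readability.
    MISC_NER_PATTERN = r"\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"

    for misc in ["O", "B-PER", "name=O", "SpaceAfter=No|name=B-PER"]:
        match = re.search(MISC_NER_PATTERN, misc)
        # group(1): full tag, group(2): IOB prefix, group(3): entity type
        print(misc, "->", match.group(1), match.group(2), match.group(3))

    # Expected output:
    # O -> O None None
    # B-PER -> B-PER B PER
    # name=O -> O None None
    # SpaceAfter=No|name=B-PER -> B-PER B PER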