Update conllu2json MISC column handling (#4715)

Update converter to handle various things in MISC column:

* `SpaceAfter=No` and set raw text accordingly
* plain NER tag
* name=NER (for NorNE)
Authored by adrianeboyd on 2019-11-26 16:10:08 +01:00; committed by Matthew Honnibal
parent 9aab0a55e1
commit 9efd3ccbef
2 changed files with 70 additions and 25 deletions
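Before the diffs, a minimal standalone sketch (not part of the commit) of what the new `MISC_NER_PATTERN` regex accepts; the sample MISC values are taken from the test data added below, and the pattern is written here as a raw string:

import re

# The pattern introduced in conllu2json below. It matches a plain NER
# tag ("B-PER"), a name=-prefixed tag as used by NorNE ("name=B-PER"),
# or a bare "O", optionally delimited by "|" inside a composite MISC
# value such as "SpaceAfter=No|name=B-PER".
MISC_NER_PATTERN = r"\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"

for misc in ["B-PER", "name=O", "SpaceAfter=No|name=B-PER"]:
    match = re.search(MISC_NER_PATTERN, misc)
    print(misc, "->", match.group(1) if match else None)
# B-PER -> B-PER
# name=O -> O
# SpaceAfter=No|name=B-PER -> B-PER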

spacy/cli/converters/conllu2json.py

@@ -18,21 +18,28 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
     """
     # by @dvsrepo, via #11 explosion/spacy-dev-resources
     # by @katarkor
+    # name=NER is to handle NorNE
+    MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"
     docs = []
+    raw = ""
     sentences = []
     conll_data = read_conllx(input_data, use_morphology=use_morphology)
     checked_for_ner = False
     has_ner_tags = False
     for i, example in enumerate(conll_data):
         if not checked_for_ner:
-            has_ner_tags = is_ner(example.token_annotation.entities[0])
+            has_ner_tags = is_ner(example.token_annotation.entities[0],
+                                  MISC_NER_PATTERN)
             checked_for_ner = True
-        sentences.append(generate_sentence(example.token_annotation, has_ner_tags))
+        raw += example.text
+        sentences.append(generate_sentence(example.token_annotation,
+                                           has_ner_tags, MISC_NER_PATTERN))
         # Real-sized documents could be extracted using the comments on the
         # conllu document
         if len(sentences) % n_sents == 0:
-            doc = create_doc(sentences, i)
+            doc = create_doc(raw, sentences, i)
             docs.append(doc)
+            raw = ""
             sentences = []
     if sentences:
         doc = create_doc(sentences, i)
@@ -40,12 +47,12 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
     return docs


-def is_ner(tag):
+def is_ner(tag, tag_pattern):
     """
     Check the 10th column of the first token to determine if the file contains
     NER tags
     """
-    tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
+    tag_match = re.search(tag_pattern, tag)
     if tag_match:
         return True
     elif tag == "O":
@@ -63,9 +70,10 @@ def read_conllx(input_data, use_morphology=False, n=0):
             while lines[0].startswith("#"):
                 lines.pop(0)
             ids, words, tags, heads, deps, ents = [], [], [], [], [], []
+            spaces = []
             for line in lines:
                 parts = line.split("\t")
-                id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
+                id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
                 if "-" in id_ or "." in id_:
                     continue
                 try:
@@ -74,18 +82,27 @@ def read_conllx(input_data, use_morphology=False, n=0):
                     dep = "ROOT" if dep == "root" else dep
                     tag = pos if tag == "_" else tag
                     tag = tag + "__" + morph if use_morphology else tag
-                    iob = iob if iob else "O"
+                    ent = misc if misc else "O"
                     ids.append(id_)
                     words.append(word)
                     tags.append(tag)
                     heads.append(head)
                     deps.append(dep)
-                    ents.append(iob)
+                    ents.append(ent)
+                    if "SpaceAfter=No" in misc:
+                        spaces.append(False)
+                    else:
+                        spaces.append(True)
                 except:  # noqa: E722
                     print(line)
                     raise
-            example = Example(doc=None)
+            raw = ""
+            for word, space in zip(words, spaces):
+                raw += word
+                if space:
+                    raw += " "
+            example = Example(doc=raw)
             example.set_token_annotation(ids=ids, words=words, tags=tags,
                                          heads=heads, deps=deps, entities=ents)
             yield example
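The hunk above is where `SpaceAfter=No` takes effect: a boolean `spaces` entry is recorded per token, then the raw text is rebuilt from the words and those flags. A standalone sketch of that logic, with illustrative values mirroring the test sentence added below:

# Token and MISC values from the NorNE test sentence.
words = ["Dommer", "Finn", "Eilertsen", "avstår", "."]
miscs = ["name=O", "SpaceAfter=No|name=B-PER", "name=I-PER",
         "SpaceAfter=No|name=O", "name=O"]

# One flag per token: append a space unless MISC says SpaceAfter=No.
spaces = ["SpaceAfter=No" not in misc for misc in miscs]
raw = ""
for word, space in zip(words, spaces):
    raw += word
    if space:
        raw += " "
print(raw.strip())  # Dommer FinnEilertsen avstår.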
@@ -94,7 +111,7 @@ def read_conllx(input_data, use_morphology=False, n=0):
            break


-def simplify_tags(iob):
+def simplify_tags(iob, tag_pattern):
     """
     Simplify tags obtained from the dataset in order to follow Wikipedia
     scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
@@ -103,26 +120,28 @@ def simplify_tags(iob):
     """
     new_iob = []
     for tag in iob:
-        tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
+        tag_match = re.search(tag_pattern, tag)
+        new_tag = "O"
         if tag_match:
-            prefix = tag_match.group(1)
-            suffix = tag_match.group(2)
-            if suffix == "GPE_LOC":
-                suffix = "LOC"
-            elif suffix == "GPE_ORG":
-                suffix = "ORG"
-            elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
-                suffix = "MISC"
-            tag = prefix + "-" + suffix
-        new_iob.append(tag)
+            prefix = tag_match.group(2)
+            suffix = tag_match.group(3)
+            if prefix and suffix:
+                if suffix == "GPE_LOC":
+                    suffix = "LOC"
+                elif suffix == "GPE_ORG":
+                    suffix = "ORG"
+                elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
+                    suffix = "MISC"
+                new_tag = prefix + "-" + suffix
+        new_iob.append(new_tag)
     return new_iob


-def generate_sentence(token_annotation, has_ner_tags):
+def generate_sentence(token_annotation, has_ner_tags, tag_pattern):
     sentence = {}
     tokens = []
     if has_ner_tags:
-        iob = simplify_tags(token_annotation.entities)
+        iob = simplify_tags(token_annotation.entities, tag_pattern)
         biluo = iob_to_biluo(iob)
     for i, id in enumerate(token_annotation.ids):
         token = {}
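`simplify_tags` now reads the IOB prefix and entity type from groups 2 and 3 of the shared pattern and folds NorNE's fine-grained types into the Wikipedia scheme. A self-contained sketch of the mapping (`TAG_PATTERN` and `simplify` are illustrative names; `EVT` is one of NorNE's extra entity types):

import re

TAG_PATTERN = r"\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"

def simplify(tag):
    # Mirrors the updated simplify_tags logic for a single tag.
    match = re.search(TAG_PATTERN, tag)
    new_tag = "O"
    if match and match.group(2) and match.group(3):
        prefix, suffix = match.group(2), match.group(3)
        if suffix == "GPE_LOC":
            suffix = "LOC"
        elif suffix == "GPE_ORG":
            suffix = "ORG"
        elif suffix not in ("PER", "LOC", "ORG"):
            suffix = "MISC"
        new_tag = prefix + "-" + suffix
    return new_tag

print([simplify(t) for t in ["name=B-GPE_LOC", "I-GPE_ORG", "B-EVT", "name=O"]])
# ['B-LOC', 'I-ORG', 'B-MISC', 'O']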
@@ -138,11 +157,12 @@ def generate_sentence(token_annotation, has_ner_tags):
     return sentence


-def create_doc(sentences, id):
+def create_doc(raw, sentences, id):
     doc = {}
     paragraph = {}
     doc["id"] = id
     doc["paragraphs"] = []
+    paragraph["raw"] = raw.strip()
     paragraph["sentences"] = sentences
     doc["paragraphs"].append(paragraph)
     return doc
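With `create_doc` taking the accumulated raw text, each converted doc now carries a `raw` field alongside its sentences. Roughly, with values borrowed from the test below:

# Shape of one converted doc; "raw" is the field added by this commit.
doc = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "Dommer FinnEilertsen avstår.",
            "sentences": [...],  # one dict per sentence, each with "tokens"
        }
    ],
}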

spacy/tests/test_cli.py

@@ -32,6 +32,32 @@ def test_cli_converters_conllu2json():
     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
+
+
+def test_cli_converters_conllu2json():
+    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
+    lines = [
+        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
+        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
+        "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
+        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
+        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
+    ]
+    input_data = "\n".join(lines)
+    converted = conllu2json(input_data, n_sents=1)
+    assert len(converted) == 1
+    assert converted[0]["id"] == 0
+    assert len(converted[0]["paragraphs"]) == 1
+    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår."
+    assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
+    sent = converted[0]["paragraphs"][0]["sentences"][0]
+    assert len(sent["tokens"]) == 5
+    tokens = sent["tokens"]
+    assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår", "."]
+    assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
+    assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
+    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
+    assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"]


 def test_cli_converters_iob2json():
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -106,7 +132,6 @@ def test_cli_converters_conll_ner2json():
     ]
     input_data = "\n".join(lines)
     converted = conll_ner2json(input_data, n_sents=10)
-    print(converted)
     assert len(converted) == 1
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1