mirror of https://github.com/explosion/spaCy.git
* Check NER length matches conll length in prepare_treebank
This commit is contained in:
parent
b76bbbd12c
commit
784e577f45
|
@ -86,6 +86,9 @@ def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
|
||||||
_, ner = read_ner.parse(ner_text, strip_bad_periods=True)
|
_, ner = read_ner.parse(ner_text, strip_bad_periods=True)
|
||||||
else:
|
else:
|
||||||
ner = ['-' for _ in annot]
|
ner = ['-' for _ in annot]
|
||||||
|
# Necessary because the ClearNLP converter deletes EDITED words.
|
||||||
|
if len(ner) != len(annot):
|
||||||
|
ner = ['-' for _ in annot]
|
||||||
for token_id, (token, token_ent) in enumerate(zip(annot, ner)):
|
for token_id, (token, token_ent) in enumerate(zip(annot, ner)):
|
||||||
para['tokens'].append(format_token(offset, token_id, token, token_ent))
|
para['tokens'].append(format_token(offset, token_id, token, token_ent))
|
||||||
|
|
||||||
|
@ -102,6 +105,7 @@ def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
|
||||||
|
|
||||||
|
|
||||||
def format_token(offset, token_id, token, ner):
|
def format_token(offset, token_id, token, ner):
|
||||||
|
assert token_id == token['id']
|
||||||
head = (token['head'] + offset) if token['head'] != -1 else -1
|
head = (token['head'] + offset) if token['head'] != -1 else -1
|
||||||
return {
|
return {
|
||||||
'id': offset + token_id,
|
'id': offset + token_id,
|
||||||
|
|
Loading…
Reference in New Issue