* Check NER length matches conll length in prepare_treebank

This commit is contained in:
Matthew Honnibal 2015-05-29 03:54:06 +02:00
parent b76bbbd12c
commit 784e577f45
1 changed file with 4 additions and 0 deletions

View File

@@ -86,6 +86,9 @@ def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
_, ner = read_ner.parse(ner_text, strip_bad_periods=True) _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
else: else:
ner = ['-' for _ in annot] ner = ['-' for _ in annot]
# Necessary because the ClearNLP converter deletes EDITED words.
if len(ner) != len(annot):
ner = ['-' for _ in annot]
for token_id, (token, token_ent) in enumerate(zip(annot, ner)): for token_id, (token, token_ent) in enumerate(zip(annot, ner)):
para['tokens'].append(format_token(offset, token_id, token, token_ent)) para['tokens'].append(format_token(offset, token_id, token, token_ent))
@@ -102,6 +105,7 @@ def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
def format_token(offset, token_id, token, ner): def format_token(offset, token_id, token, ner):
assert token_id == token['id']
head = (token['head'] + offset) if token['head'] != -1 else -1 head = (token['head'] + offset) if token['head'] != -1 else -1
return { return {
'id': offset + token_id, 'id': offset + token_id,