From 32ae2cdabe9da4aa924637634a3ecbf2b8374824 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 26 May 2015 19:52:39 +0200 Subject: [PATCH] * In prepare_treebank, move ner into the token descriptions --- bin/prepare_treebank.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/bin/prepare_treebank.py b/bin/prepare_treebank.py index b84277a06..acd544944 100644 --- a/bin/prepare_treebank.py +++ b/bin/prepare_treebank.py @@ -59,8 +59,9 @@ def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text): _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True) if ner_sents is not None: _, ner = read_ner.parse(ner_sents[i], strip_bad_periods=True) + assert len(ner) == len(annot) else: - ner = None + ner = ['-' for _ in annot] for token_id, token in enumerate(annot): try: head = (token['head'] + offset) if token['head'] != -1 else -1 @@ -69,16 +70,10 @@ def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text): 'orth': token['word'], 'tag': token['tag'], 'head': head, - 'dep': token['dep']}) + 'dep': token['dep'], + 'ner': ner[token_id]}) except: raise - if ner is not None: - for label, start, end in ner: - if start != end: - para['entities'].append({ - 'label': label, - 'first': start + offset, - 'last': (end-1) + offset}) for label, start, end in brackets: if start != end: para['brackets'].append({