From 60d26243e3a18982608c99d3b0d1dfa107e25ab1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 18 Jun 2015 16:35:27 +0200 Subject: [PATCH] * Fix head alignment in read_conll.parse, which was causing corrupt parses when strip_bad_periods=True. A similar problem may apply to other data readers. --- spacy/munge/read_conll.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/munge/read_conll.py b/spacy/munge/read_conll.py index ed6037a4d..a120ea497 100644 --- a/spacy/munge/read_conll.py +++ b/spacy/munge/read_conll.py @@ -10,11 +10,12 @@ def parse(sent_text, strip_bad_periods=False): assert sent_text annot = [] words = [] - id_map = {} + id_map = {-1: -1} for i, line in enumerate(sent_text.split('\n')): word, tag, head, dep = _parse_line(line) if strip_bad_periods and words and _is_bad_period(words[-1], word): continue + id_map[i] = len(words) annot.append({ 'id': len(words), @@ -23,6 +24,8 @@ def parse(sent_text, strip_bad_periods=False): 'head': int(head) - 1, 'dep': dep}) words.append(word) + for entry in annot: + entry['head'] = id_map[entry['head']] return words, annot