* Fix head alignment in read_conll.parse, which was causing corrupt parses when strip_bad_periods=True. A similar problem may apply to other data readers.

2015-06-18 16:35:27 +02:00 · 2015-06-18 16:35:27 +02:00 · 60d26243e3
parent f868175e43
commit 60d26243e3
1 changed file with 4 additions and 1 deletion
--- a/spacy/munge/read_conll.py
+++ b/spacy/munge/read_conll.py
@@ -10,11 +10,12 @@ def parse(sent_text, strip_bad_periods=False):
    assert sent_text
    annot = []
    words = []
-    id_map = {}
+    id_map = {-1: -1}
    for i, line in enumerate(sent_text.split('\n')):
        word, tag, head, dep = _parse_line(line)
        if strip_bad_periods and words and _is_bad_period(words[-1], word):
            continue
+        id_map[i] = len(words)
  
        annot.append({
            'id': len(words),
@@ -23,6 +24,8 @@ def parse(sent_text, strip_bad_periods=False):
            'head': int(head) - 1,
            'dep': dep})
        words.append(word)
+    for entry in annot:
+        entry['head'] = id_map[entry['head']]
    return words, annot