mirror of https://github.com/explosion/spaCy.git
* Fix head alignment in read_conll.parse, which was causing corrupt parses when strip_bad_periods=True. A similar problem may apply to other data readers.
This commit is contained in:
parent
f868175e43
commit
60d26243e3
|
@@ -10,11 +10,12 @@ def parse(sent_text, strip_bad_periods=False):
|
||||||
assert sent_text
|
assert sent_text
|
||||||
annot = []
|
annot = []
|
||||||
words = []
|
words = []
|
||||||
id_map = {}
|
id_map = {-1: -1}
|
||||||
for i, line in enumerate(sent_text.split('\n')):
|
for i, line in enumerate(sent_text.split('\n')):
|
||||||
word, tag, head, dep = _parse_line(line)
|
word, tag, head, dep = _parse_line(line)
|
||||||
if strip_bad_periods and words and _is_bad_period(words[-1], word):
|
if strip_bad_periods and words and _is_bad_period(words[-1], word):
|
||||||
continue
|
continue
|
||||||
|
id_map[i] = len(words)
|
||||||
|
|
||||||
annot.append({
|
annot.append({
|
||||||
'id': len(words),
|
'id': len(words),
|
||||||
|
@@ -23,6 +24,8 @@ def parse(sent_text, strip_bad_periods=False):
|
||||||
'head': int(head) - 1,
|
'head': int(head) - 1,
|
||||||
'dep': dep})
|
'dep': dep})
|
||||||
words.append(word)
|
words.append(word)
|
||||||
|
for entry in annot:
|
||||||
|
entry['head'] = id_map[entry['head']]
|
||||||
return words, annot
|
return words, annot
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue