spaCy/spacy/munge/read_conll.py

41 lines
912 B
Python

from __future__ import unicode_literals
def split(text):
return [sent.strip() for sent in text.split('\n\n') if sent.strip()]
def parse(sent_text, strip_bad_periods=False):
sent_text = sent_text.strip()
assert sent_text
annot = []
words = []
i = 0
for line in sent_text.split('\n'):
word, tag, head, dep = line.split()
if strip_bad_periods and words and _is_bad_period(words[-1], word):
continue
annot.append({
'id': i,
'word': word,
'tag': tag,
'head': int(head) - 1 if int(head) != 0 else i,
'dep': dep})
words.append(word)
i += 1
return words, annot
def _is_bad_period(prev, period):
if period != '.':
return False
elif prev == '.':
return False
elif not prev.endswith('.'):
return False
else:
return True