# spaCy/spacy/munge/read_conll.py

from __future__ import unicode_literals


def split(text):
    """Split a document into sentence chunks separated by blank lines.

    A line that is empty or contains only whitespace (including a lone
    carriage return left over from CRLF line endings) is treated as a
    sentence boundary.  Runs of consecutive blank lines count as a single
    boundary, so no empty chunks are ever produced.

    Args:
        text: The full text, with one token per line and sentences
            separated by one or more blank lines.

    Returns:
        A list of non-empty strings, one per sentence, each stripped of
        leading and trailing whitespace.  Lines inside a sentence are joined
        with ``'\\n'`` and are otherwise left untouched.
    """
    sentences = []
    current = []
    for line in text.split('\n'):
        if line.strip():
            current.append(line)
        elif current:
            # Blank / whitespace-only line closes the sentence in progress.
            sentences.append('\n'.join(current).strip())
            current = []
    if current:
        # The text did not end with a blank line; flush the last sentence.
        sentences.append('\n'.join(current).strip())
    return sentences


def parse(sent_text, strip_bad_periods=False):
    """Parse one sentence given as ``word tag head dep`` lines.

    Each line must hold exactly four whitespace-separated fields.  ``head``
    is the 1-based position of the token's head within the sentence, with
    ``0`` marking the root.  In the output, positions are 0-based and the
    root token's ``head`` is its own ``id``.

    Args:
        sent_text: The text of a single sentence, one token per line.
        strip_bad_periods: If true, drop every token for which
            ``_is_bad_period(previous_kept_word, word)`` is true.  The
            ``id`` and ``head`` values of the remaining tokens are
            renumbered to match the shortened sentence; a token whose head
            was dropped is re-attached to the dropped token's own head (or
            becomes its own head if that chain reaches the root).

    Returns:
        A ``(words, annot)`` tuple: ``words`` is the list of kept word
        strings and ``annot`` is a parallel list of dicts with the keys
        ``'id'``, ``'word'``, ``'tag'``, ``'head'`` and ``'dep'``.

    Raises:
        AssertionError: If ``sent_text`` is empty after stripping.
        ValueError: If a line does not have exactly four fields or its head
            field is not an integer.
    """
    sent_text = sent_text.strip()
    assert sent_text
    words = []
    annot = []
    # Head of each kept token as an original 0-based position (None = root).
    # Heads can point forwards, so they are resolved after the whole
    # sentence has been read and the final positions are known.
    pending_heads = []
    # Original position -> position after dropped tokens are removed.
    new_index = {}
    # Original position of a dropped token -> original position of its head.
    dropped = {}
    for orig_i, line in enumerate(sent_text.split('\n')):
        word, tag, head, dep = line.split()
        head = int(head)
        orig_head = head - 1 if head != 0 else None
        if strip_bad_periods and words and _is_bad_period(words[-1], word):
            dropped[orig_i] = orig_head
            continue

        new_index[orig_i] = len(words)
        annot.append({
            'id': len(words),
            'word': word,
            'tag': tag,
            'head': None,
            'dep': dep})
        pending_heads.append(orig_head)
        words.append(word)

    for token, orig_head in zip(annot, pending_heads):
        # Climb past dropped tokens; `seen` guards against malformed input
        # in which dropped tokens form a cycle.
        seen = set()
        while orig_head in dropped and orig_head not in seen:
            seen.add(orig_head)
            orig_head = dropped[orig_head]
        if orig_head is None or orig_head in dropped:
            # Root (or no surviving ancestor): the token heads itself.
            token['head'] = token['id']
        else:
            # Out-of-range heads have no mapping and pass through unchanged.
            token['head'] = new_index.get(orig_head, orig_head)
    return words, annot


def _is_bad_period(prev, period):
    if period != '.':
        return False
    elif prev == '.':
        return False
    elif not prev.endswith('.'):
        return False
    else:
        return True