* Clean unused functions from spacy.syntax.conll

2015-05-24 20:06:46 +02:00 · 2015-05-24 20:06:46 +02:00 · efe7a7d7d6
parent 78487f3e66
commit efe7a7d7d6
1 changed files with 0 additions and 77 deletions
--- a/spacy/syntax/conll.pyx
+++ b/spacy/syntax/conll.pyx
@ -32,69 +32,6 @@ def read_json_file(loc):
    return paragraphs
 def read_conll03_file(loc):
    """Read a CoNLL-2003 style file into per-sentence annotation tuples.

    The file is split into documents on the '-DOCSTART- -X- O O' marker and
    each document into sentences on blank lines. Every non-empty sentence
    line must hold exactly four whitespace-separated columns:
    word, POS tag, chunk tag and IOB entity tag (the chunk tag is ignored).

    Args:
        loc: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one entry per sentence, each of the form
        (text, [words], (ids, words, tags, heads, labels, ents)) where
        text is the words joined by single spaces, ids are the character
        offsets of each word within that joined text, heads is all -1,
        labels is all 'ROOT', and ents is the result of passing the IOB
        column through _iob_to_biluo.

    Raises:
        ValueError: If a sentence line does not have exactly four columns.
    """
    # Read inside a context manager so the file handle is closed promptly
    # instead of being left for the garbage collector.
    with codecs.open(loc, 'r', 'utf8') as file_:
        text = file_.read().strip()
    sents = []
    for doc in text.split('-DOCSTART- -X- O O'):
        doc = doc.strip()
        if not doc:
            continue
        for sent_str in doc.split('\n\n'):
            sent_str = sent_str.strip()
            if not sent_str:
                # Runs of several blank lines produce empty chunks; skip
                # them rather than failing on the 4-column unpack below.
                continue
            words = []
            tags = []
            iob_ents = []
            ids = []
            idx = 0
            for line in sent_str.split('\n'):
                word, tag, chunk, iob = line.split()
                # Normalise the double-quote tag to the opening-quote tag.
                if tag == '"':
                    tag = '``'
                # Ambiguous tags such as 'A|B' keep only the first option.
                if '|' in tag:
                    tag = tag.split('|')[0]
                words.append(word)
                tags.append(tag)
                iob_ents.append(iob)
                ids.append(idx)
                # Offset of the next word in the space-joined text.
                idx += len(word) + 1
            # No dependency annotation in this format: placeholder values.
            heads = [-1] * len(words)
            labels = ['ROOT'] * len(words)
            sents.append((' '.join(words), [words],
                          (ids, words, tags, heads, labels,
                           _iob_to_biluo(iob_ents))))
    return sents
 def read_docparse_file(loc):
    """Read a "docparse" file into per-block annotation tuples.

    The file is split into blocks on blank lines. Within a block the first
    line is the raw text, the second line is the tokenised text (pieces
    separated by '<SENT>', with '<SEP>' standing in for a space inside a
    piece), and each remaining line is a token row handed to _parse_line.

    Args:
        loc: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one entry per block, each of the form
        (raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents), [])
        where tokenized is a list of token lists, one per '<SENT>' piece.
        An empty or whitespace-only file yields an empty list.

    Raises:
        IndexError: If a non-empty block has fewer than two lines.
    """
    # Read inside a context manager so the file handle is closed promptly
    # instead of being left for the garbage collector.
    with codecs.open(loc, 'r', 'utf8') as file_:
        text = file_.read().strip()
    sents = []
    for sent_str in text.split('\n\n'):
        sent_str = sent_str.strip()
        if not sent_str:
            # Empty input (or stray blank runs) has no header lines to pop.
            continue
        words = []
        heads = []
        labels = []
        tags = []
        ids = []
        iob_ents = []
        lines = sent_str.split('\n')
        raw_text = lines.pop(0).strip()
        tok_text = lines.pop(0).strip()
        for line in lines:
            id_, word, pos_string, head_idx, label, iob_ent = _parse_line(line)
            if label == 'root':
                label = 'ROOT'
            words.append(word)
            # A negative head index is replaced by the token's own id.
            if head_idx < 0:
                head_idx = id_
            ids.append(id_)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
            iob_ents.append(iob_ent)
        tokenized = [s.replace('<SEP>', ' ').split(' ')
                     for s in tok_text.split('<SENT>')]
        tuples = (ids, words, tags, heads, labels, iob_ents)
        sents.append((raw_text, tokenized, tuples, []))
    return sents
 def _iob_to_biluo(tags):
    out = []
    curr_label = None
@ -128,20 +65,6 @@ def _consume_ent(tags):
        return [start] + middle + [end]
 def _parse_line(line):
    pieces = line.split()
    if len(pieces) == 4:
        return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]
    else:
        id_ = int(pieces[0])
        word = pieces[1]
        pos = pieces[3]
        iob_ent = pieces[5]
        head_idx = int(pieces[6])
        label = pieces[7]
        return id_, word, pos, head_idx, label, iob_ent
 cdef class GoldParse:
    def __init__(self, tokens, annot_tuples, brackets=tuple()):
        self.mem = Pool()