mirror of https://github.com/explosion/spaCy.git
Remove unused munge package
This commit is contained in:
parent c714841cc8
commit 7b83977020
@@ -1,242 +0,0 @@
"""Align the raw sentences from Read et al (2012) to the PTB tokenization,
|
|
||||||
outputting as a .json file. Used in bin/prepare_treebank.py
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
|
||||||
import json
|
|
||||||
from os import path
|
|
||||||
import os
|
|
||||||
|
|
||||||
from spacy.munge import read_ptb
|
|
||||||
from spacy.munge.read_ontonotes import sgml_extract
|
|
||||||
|
|
||||||
|
|
||||||
def read_odc(section_loc):
|
|
||||||
# Arbitrary patches applied to the _raw_ text to promote alignment.
|
|
||||||
patches = (
|
|
||||||
('. . . .', '...'),
|
|
||||||
('....', '...'),
|
|
||||||
('Co..', 'Co.'),
|
|
||||||
("`", "'"),
|
|
||||||
# OntoNotes specific
|
|
||||||
(" S$", " US$"),
|
|
||||||
("Showtime or a sister service", "Showtime or a service"),
|
|
||||||
("The hotel and gaming company", "The hotel and Gaming company"),
|
|
||||||
("I'm-coming-down-your-throat", "I-'m coming-down-your-throat"),
|
|
||||||
)
|
|
||||||
|
|
||||||
paragraphs = []
|
|
||||||
with open(section_loc) as file_:
|
|
||||||
para = []
|
|
||||||
for line in file_:
|
|
||||||
if line.startswith('['):
|
|
||||||
line = line.split('|', 1)[1].strip()
|
|
||||||
for find, replace in patches:
|
|
||||||
line = line.replace(find, replace)
|
|
||||||
para.append(line)
|
|
||||||
else:
|
|
||||||
paragraphs.append(para)
|
|
||||||
para = []
|
|
||||||
paragraphs.append(para)
|
|
||||||
return paragraphs
|
|
||||||
|
|
||||||
|
|
||||||
def read_ptb_sec(ptb_sec_dir):
|
|
||||||
ptb_sec_dir = Path(ptb_sec_dir)
|
|
||||||
files = []
|
|
||||||
for loc in ptb_sec_dir.iterdir():
|
|
||||||
if not str(loc).endswith('parse') and not str(loc).endswith('mrg'):
|
|
||||||
continue
|
|
||||||
filename = loc.parts[-1].split('.')[0]
|
|
||||||
with loc.open() as file_:
|
|
||||||
text = file_.read()
|
|
||||||
sents = []
|
|
||||||
for parse_str in read_ptb.split(text):
|
|
||||||
words, brackets = read_ptb.parse(parse_str, strip_bad_periods=True)
|
|
||||||
words = [_reform_ptb_word(word) for word in words]
|
|
||||||
string = ' '.join(words)
|
|
||||||
sents.append((filename, string))
|
|
||||||
files.append(sents)
|
|
||||||
return files
|
|
||||||
|
|
||||||
|
|
||||||
def _reform_ptb_word(tok):
|
|
||||||
tok = tok.replace("``", '"')
|
|
||||||
tok = tok.replace("`", "'")
|
|
||||||
tok = tok.replace("''", '"')
|
|
||||||
tok = tok.replace('\\', '')
|
|
||||||
tok = tok.replace('-LCB-', '{')
|
|
||||||
tok = tok.replace('-RCB-', '}')
|
|
||||||
tok = tok.replace('-RRB-', ')')
|
|
||||||
tok = tok.replace('-LRB-', '(')
|
|
||||||
tok = tok.replace("'T-", "'T")
|
|
||||||
return tok
|
|
||||||
|
|
||||||
|
|
||||||
def get_alignment(raw_by_para, ptb_by_file):
|
|
||||||
# These are list-of-lists, by paragraph and file respectively.
|
|
||||||
# Flatten them into a list of (outer_id, inner_id, item) triples
|
|
||||||
raw_sents = _flatten(raw_by_para)
|
|
||||||
ptb_sents = list(_flatten(ptb_by_file))
|
|
||||||
|
|
||||||
output = []
|
|
||||||
ptb_idx = 0
|
|
||||||
n_skipped = 0
|
|
||||||
skips = []
|
|
||||||
for (p_id, p_sent_id, raw) in raw_sents:
|
|
||||||
if ptb_idx >= len(ptb_sents):
|
|
||||||
n_skipped += 1
|
|
||||||
continue
|
|
||||||
f_id, f_sent_id, (ptb_id, ptb) = ptb_sents[ptb_idx]
|
|
||||||
alignment = align_chars(raw, ptb)
|
|
||||||
if not alignment:
|
|
||||||
skips.append((ptb, raw))
|
|
||||||
n_skipped += 1
|
|
||||||
continue
|
|
||||||
ptb_idx += 1
|
|
||||||
sepped = []
|
|
||||||
for i, c in enumerate(ptb):
|
|
||||||
if alignment[i] is False:
|
|
||||||
sepped.append('<SEP>')
|
|
||||||
else:
|
|
||||||
sepped.append(c)
|
|
||||||
output.append((f_id, p_id, f_sent_id, (ptb_id, ''.join(sepped))))
|
|
||||||
if n_skipped + len(ptb_sents) != len(raw_sents):
|
|
||||||
for ptb, raw in skips:
|
|
||||||
print(ptb)
|
|
||||||
print(raw)
|
|
||||||
raise Exception
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
def _flatten(nested):
|
|
||||||
flat = []
|
|
||||||
for id1, inner in enumerate(nested):
|
|
||||||
flat.extend((id1, id2, item) for id2, item in enumerate(inner))
|
|
||||||
return flat
|
|
||||||
|
|
||||||
|
|
||||||
def align_chars(raw, ptb):
|
|
||||||
if raw.replace(' ', '') != ptb.replace(' ', ''):
|
|
||||||
return None
|
|
||||||
i = 0
|
|
||||||
j = 0
|
|
||||||
|
|
||||||
length = len(raw)
|
|
||||||
alignment = [False for _ in range(len(ptb))]
|
|
||||||
while i < length:
|
|
||||||
if raw[i] == ' ' and ptb[j] == ' ':
|
|
||||||
alignment[j] = True
|
|
||||||
i += 1
|
|
||||||
j += 1
|
|
||||||
elif raw[i] == ' ':
|
|
||||||
i += 1
|
|
||||||
elif ptb[j] == ' ':
|
|
||||||
j += 1
|
|
||||||
assert raw[i].lower() == ptb[j].lower(), raw[i:1]
|
|
||||||
alignment[j] = i
|
|
||||||
i += 1; j += 1
|
|
||||||
return alignment
|
|
||||||
|
|
||||||
|
|
||||||
def group_into_files(sents):
|
|
||||||
last_id = 0
|
|
||||||
last_fn = None
|
|
||||||
this = []
|
|
||||||
output = []
|
|
||||||
for f_id, p_id, s_id, (filename, sent) in sents:
|
|
||||||
if f_id != last_id:
|
|
||||||
assert last_fn is not None
|
|
||||||
output.append((last_fn, this))
|
|
||||||
this = []
|
|
||||||
last_fn = filename
|
|
||||||
this.append((f_id, p_id, s_id, sent))
|
|
||||||
last_id = f_id
|
|
||||||
if this:
|
|
||||||
assert last_fn is not None
|
|
||||||
output.append((last_fn, this))
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
def group_into_paras(sents):
|
|
||||||
last_id = 0
|
|
||||||
this = []
|
|
||||||
output = []
|
|
||||||
for f_id, p_id, s_id, sent in sents:
|
|
||||||
if p_id != last_id and this:
|
|
||||||
output.append(this)
|
|
||||||
this = []
|
|
||||||
this.append(sent)
|
|
||||||
last_id = p_id
|
|
||||||
if this:
|
|
||||||
output.append(this)
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
def get_sections(odc_dir, ptb_dir, out_dir):
|
|
||||||
for i in range(25):
|
|
||||||
section = str(i) if i >= 10 else ('0' + str(i))
|
|
||||||
odc_loc = path.join(odc_dir, 'wsj%s.txt' % section)
|
|
||||||
ptb_sec = path.join(ptb_dir, section)
|
|
||||||
out_loc = path.join(out_dir, 'wsj%s.json' % section)
|
|
||||||
yield odc_loc, ptb_sec, out_loc
|
|
||||||
|
|
||||||
|
|
||||||
def align_section(raw_paragraphs, ptb_files):
|
|
||||||
aligned = get_alignment(raw_paragraphs, ptb_files)
|
|
||||||
return [(fn, group_into_paras(sents))
|
|
||||||
for fn, sents in group_into_files(aligned)]
|
|
||||||
|
|
||||||
|
|
||||||
def do_wsj(odc_dir, ptb_dir, out_dir):
|
|
||||||
for odc_loc, ptb_sec_dir, out_loc in get_sections(odc_dir, ptb_dir, out_dir):
|
|
||||||
files = align_section(read_odc(odc_loc), read_ptb_sec(ptb_sec_dir))
|
|
||||||
with open(out_loc, 'w') as file_:
|
|
||||||
json.dump(files, file_)
|
|
||||||
|
|
||||||
|
|
||||||
def do_web(src_dir, onto_dir, out_dir):
|
|
||||||
mapping = dict(line.split() for line in open(path.join(onto_dir, 'map.txt'))
|
|
||||||
if len(line.split()) == 2)
|
|
||||||
for annot_fn, src_fn in mapping.items():
|
|
||||||
if not annot_fn.startswith('eng'):
|
|
||||||
continue
|
|
||||||
|
|
||||||
ptb_loc = path.join(onto_dir, annot_fn + '.parse')
|
|
||||||
src_loc = path.join(src_dir, src_fn + '.sgm')
|
|
||||||
|
|
||||||
if path.exists(ptb_loc) and path.exists(src_loc):
|
|
||||||
src_doc = sgml_extract(open(src_loc).read())
|
|
||||||
ptb_doc = [read_ptb.parse(parse_str, strip_bad_periods=True)[0]
|
|
||||||
for parse_str in read_ptb.split(open(ptb_loc).read())]
|
|
||||||
print('Found')
|
|
||||||
else:
|
|
||||||
print('Miss')
|
|
||||||
|
|
||||||
|
|
||||||
def may_mkdir(parent, *subdirs):
|
|
||||||
if not path.exists(parent):
|
|
||||||
os.mkdir(parent)
|
|
||||||
for i in range(1, len(subdirs)):
|
|
||||||
directories = (parent,) + subdirs[:i]
|
|
||||||
subdir = path.join(*directories)
|
|
||||||
if not path.exists(subdir):
|
|
||||||
os.mkdir(subdir)
|
|
||||||
|
|
||||||
|
|
||||||
def main(odc_dir, onto_dir, out_dir):
|
|
||||||
may_mkdir(out_dir, 'wsj', 'align')
|
|
||||||
may_mkdir(out_dir, 'web', 'align')
|
|
||||||
#do_wsj(odc_dir, path.join(ontonotes_dir, 'wsj', 'orig'),
|
|
||||||
# path.join(out_dir, 'wsj', 'align'))
|
|
||||||
do_web(
|
|
||||||
path.join(onto_dir, 'data', 'english', 'metadata', 'context', 'wb', 'sel'),
|
|
||||||
path.join(onto_dir, 'data', 'english', 'annotations', 'wb'),
|
|
||||||
path.join(out_dir, 'web', 'align'))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
plac.call(main)
|
|
|
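For context, a small sketch of what align_chars computes (the strings here are invented, not corpus data): each PTB character maps to its offset in the raw text, and PTB positions with no raw counterpart, i.e. tokenizer-inserted spaces, stay False, which get_alignment renders as <SEP>:

    raw = "can't"
    ptb = "ca n't"
    assert align_chars(raw, ptb) == [0, 1, False, 2, 3, 4]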
@@ -1,49 +0,0 @@
from __future__ import unicode_literals


def split(text):
    return [sent.strip() for sent in text.split('\n\n') if sent.strip()]


def parse(sent_text, strip_bad_periods=False):
    sent_text = sent_text.strip()
    assert sent_text
    annot = []
    words = []
    id_map = {-1: -1}
    for i, line in enumerate(sent_text.split('\n')):
        word, tag, head, dep = _parse_line(line)
        if strip_bad_periods and words and _is_bad_period(words[-1], word):
            continue
        id_map[i] = len(words)

        annot.append({
            'id': len(words),
            'word': word,
            'tag': tag,
            'head': int(head) - 1,
            'dep': dep})
        words.append(word)
    for entry in annot:
        entry['head'] = id_map[entry['head']]
    return words, annot


def _is_bad_period(prev, period):
    if period != '.':
        return False
    elif prev == '.':
        return False
    elif not prev.endswith('.'):
        return False
    else:
        return True


def _parse_line(line):
    pieces = line.split()
    if len(pieces) == 4:
        return pieces
    else:
        return pieces[1], pieces[3], pieces[5], pieces[6]
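As a usage sketch (the input is invented for illustration, not real treebank data): parse reads one token per line in the four-column form "word tag head dep", with 1-based heads and 0 for the root, and re-numbers heads through id_map when bad periods are stripped:

    words, annot = parse('Pierre NNP 2 nsubj\nsleeps VBZ 0 ROOT')
    assert words == ['Pierre', 'sleeps']
    assert annot[0]['head'] == 1   # Pierre attaches to sleeps
    assert annot[1]['head'] == -1  # root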
@@ -1,116 +0,0 @@
from __future__ import unicode_literals
import os
from os import path
import re


def split(text):
    """Split an annotation file by sentence. Each sentence's annotation should
    be a single string."""
    return text.strip().split('\n')[1:-1]


def parse(string, strip_bad_periods=False):
    """Given a sentence's annotation string, return a list of token strings
    and a list of BILUO tags, one per token. Use tags_to_entities to convert
    the tags into (label, start, end) triples."""
    tokens = []
    tags = []
    open_tag = None
    # Arbitrary corrections to promote alignment, and ensure that entities
    # begin at a space. This allows us to treat entities as tokens, making it
    # easier to return the list of entities.
    string = string.replace('... .', '...')
    string = string.replace('U.S.</ENAMEX> .', 'U.S.</ENAMEX>')
    string = string.replace('Co.</ENAMEX> .', 'Co.</ENAMEX>')
    string = string.replace('U.S. .', 'U.S.')
    string = string.replace('<ENAMEX ', '<ENAMEX')
    string = string.replace(' E_OFF="', 'E_OFF="')
    string = string.replace(' S_OFF="', 'S_OFF="')
    string = string.replace('units</ENAMEX>-<ENAMEX', 'units</ENAMEX> - <ENAMEX')
    string = string.replace('<ENAMEXTYPE="PERSON"E_OFF="1">Paula</ENAMEX> Zahn', 'Paula Zahn')
    string = string.replace('<ENAMEXTYPE="CARDINAL"><ENAMEXTYPE="CARDINAL">little</ENAMEX> drain</ENAMEX>', 'little drain')
    for substr in string.strip().split():
        substr = _fix_inner_entities(substr)
        tokens.append(_get_text(substr))
        tag, open_tag = _get_tag(substr, open_tag)
        tags.append(tag)
    return tokens, tags


tag_re = re.compile(r'<ENAMEXTYPE="[^"]+">')


def _fix_inner_entities(substr):
    tags = tag_re.findall(substr)
    if '</ENAMEX' in substr and not substr.endswith('</ENAMEX'):
        substr = substr.replace('</ENAMEX>', '') + '</ENAMEX>'
    if tags:
        substr = tag_re.sub('', substr)
        return tags[0] + substr
    else:
        return substr


def _get_tag(substr, tag):
    if substr.startswith('<'):
        tag = substr.split('"')[1]
        if substr.endswith('>'):
            return 'U-' + tag, None
        else:
            return 'B-%s' % tag, tag
    elif substr.endswith('>'):
        return 'L-' + tag, None
    elif tag is not None:
        return 'I-' + tag, tag
    else:
        return 'O', None


def _get_text(substr):
    if substr.startswith('<'):
        substr = substr.split('>', 1)[1]
    if substr.endswith('>'):
        substr = substr.split('<')[0]
    return reform_string(substr)


def tags_to_entities(tags):
    entities = []
    start = None
    for i, tag in enumerate(tags):
        if tag.startswith('O'):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            if start is not None:
                start = None
            continue
        elif tag == '-':
            continue
        elif tag.startswith('I'):
            assert start is not None, tags[:i]
            continue
        if tag.startswith('U'):
            entities.append((tag[2:], i, i))
        elif tag.startswith('B'):
            start = i
        elif tag.startswith('L'):
            entities.append((tag[2:], start, i))
            start = None
        else:
            raise Exception(tag)
    return entities


def reform_string(tok):
    tok = tok.replace("``", '"')
    tok = tok.replace("`", "'")
    tok = tok.replace("''", '"')
    tok = tok.replace('\\', '')
    tok = tok.replace('-LCB-', '{')
    tok = tok.replace('-RCB-', '}')
    tok = tok.replace('-RRB-', ')')
    tok = tok.replace('-LRB-', '(')
    tok = tok.replace("'T-", "'T")
    tok = tok.replace('-AMP-', '&')
    return tok
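For reference, a minimal sketch of the BILUO conversion (the tag sequence is invented for illustration): tags_to_entities turns per-token tags into (label, start, end) triples with inclusive token indices:

    tags = ['O', 'B-ORG', 'L-ORG', 'U-GPE', 'O']
    assert tags_to_entities(tags) == [('ORG', 1, 2), ('GPE', 3, 3)]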
spacy/munge/read_ontonotes.py
@@ -1,47 +0,0 @@
import re


docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')
doctype_re = re.compile(r'<DOCTYPE SOURCE="[^"]+">([^>]+)</DOCTYPE>')
datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')
headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)
post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)
poster_re = re.compile(r'<POSTER>(.+)</POSTER>')
postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')
tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')


def sgml_extract(text_data):
    """Extract text from the OntoNotes web documents.

    Format:
    {
        docid: string,
        doctype: string,
        datetime: string,
        headline: string,
        poster: string,
        postdate: string,
        text: string
    }
    """
    return {
        'docid': _get_one(docid_re, text_data, required=True),
        'doctype': _get_one(doctype_re, text_data, required=True),
        'datetime': _get_one(datetime_re, text_data, required=True),
        'headline': _get_one(headline_re, text_data, required=True),
        'poster': _get_one(poster_re, _get_one(post_re, text_data)),
        'postdate': _get_one(postdate_re, _get_one(post_re, text_data)),
        'text': _get_text(_get_one(post_re, text_data)).strip()
    }


def _get_one(regex, text, required=False):
    matches = regex.search(text)
    if not matches and not required:
        return ''
    assert len(matches.groups()) == 1, matches
    return matches.groups()[0].strip()


def _get_text(data):
    return tag_re.sub('', data).replace('<P>', '').replace('</P>', '')
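A usage sketch on a hypothetical minimal document (the field values and layout are invented, not real OntoNotes data). Note that tag_re strips any one-span <TAG>...</TAG> pair from the post body, so running text survives only when it is not wrapped in a closed tag pair:

    doc = sgml_extract(
        '<DOCID>doc001@wb@en@on</DOCID>\n'
        '<DOCTYPE SOURCE="web">BLOG</DOCTYPE>\n'
        '<DATETIME>2006-01-01T12:00:00</DATETIME>\n'
        '<HEADLINE>A headline</HEADLINE>\n'
        '<POST>\n'
        '<POSTER>someone</POSTER>\n'
        '<POSTDATE>2006-01-01</POSTDATE>\n'
        '<P>\n'
        'Body text of the post.\n'
        '</POST>'
    )
    assert doc['docid'] == 'doc001@wb@en@on'
    assert doc['poster'] == 'someone'
    assert doc['text'] == 'Body text of the post.'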
spacy/munge/read_ptb.py
@@ -1,65 +0,0 @@
import re
import os
from os import path


def parse(sent_text, strip_bad_periods=False):
    sent_text = sent_text.strip()
    assert sent_text and sent_text.startswith('(')
    open_brackets = []
    brackets = []
    bracketsRE = re.compile(r'(\()([^\s\)\(]+)|([^\s\)\(]+)?(\))')
    word_i = 0
    words = []
    # Remove outermost bracket
    if sent_text.startswith('(('):
        sent_text = sent_text.replace('((', '( (', 1)
    for match in bracketsRE.finditer(sent_text[2:-1]):
        open_, label, text, close = match.groups()
        if open_:
            assert not close
            assert label.strip()
            open_brackets.append((label, word_i))
        else:
            assert close
            label, start = open_brackets.pop()
            assert label.strip()
            if strip_bad_periods and words and _is_bad_period(words[-1], text):
                continue
            # Traces leave 0-width bracket, but no token
            if text and label != '-NONE-':
                words.append(text)
                word_i += 1
            else:
                brackets.append((label, start, word_i))
    return words, brackets


def _is_bad_period(prev, period):
    if period != '.':
        return False
    elif prev == '.':
        return False
    elif not prev.endswith('.'):
        return False
    else:
        return True


def split(text):
    sentences = []
    current = []

    for line in text.strip().split('\n'):
        line = line.rstrip()
        if not line:
            continue
        # Detect the start of sentences by line starting with (
        # This is messy, but it keeps bracket parsing at the sentence level
        if line.startswith('(') and current:
            sentences.append('\n'.join(current))
            current = []
        current.append(line)
    if current:
        sentences.append('\n'.join(current))
    return sentences
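A usage sketch (the bracketing is invented for illustration): parse returns the terminal tokens plus (label, start, end) spans for the non-terminal brackets, with end exclusive:

    tree = '( (S (NP (NNP Pierre)) (VP (VBZ sleeps)) (. .)) )'
    words, brackets = parse(tree)
    assert words == ['Pierre', 'sleeps', '.']
    assert ('NP', 0, 1) in brackets and ('S', 0, 3) in brackets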