mirror of https://github.com/explosion/spaCy.git
Remove unused munge package
parent c714841cc8
commit 7b83977020
@@ -1,242 +0,0 @@
"""Align the raw sentences from Read et al (2012) to the PTB tokenization,
|
||||
outputting as a .json file. Used in bin/prepare_treebank.py
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
import json
|
||||
from os import path
|
||||
import os
|
||||
|
||||
from spacy.munge import read_ptb
|
||||
from spacy.munge.read_ontonotes import sgml_extract
|
||||
|
||||
|
||||
def read_odc(section_loc):
|
||||
# Arbitrary patches applied to the _raw_ text to promote alignment.
|
||||
patches = (
|
||||
('. . . .', '...'),
|
||||
('....', '...'),
|
||||
('Co..', 'Co.'),
|
||||
("`", "'"),
|
||||
# OntoNotes specific
|
||||
(" S$", " US$"),
|
||||
("Showtime or a sister service", "Showtime or a service"),
|
||||
("The hotel and gaming company", "The hotel and Gaming company"),
|
||||
("I'm-coming-down-your-throat", "I-'m coming-down-your-throat"),
|
||||
)
|
||||
|
||||
paragraphs = []
|
||||
with open(section_loc) as file_:
|
||||
para = []
|
||||
for line in file_:
|
||||
if line.startswith('['):
|
||||
line = line.split('|', 1)[1].strip()
|
||||
for find, replace in patches:
|
||||
line = line.replace(find, replace)
|
||||
para.append(line)
|
||||
else:
|
||||
paragraphs.append(para)
|
||||
para = []
|
||||
paragraphs.append(para)
|
||||
return paragraphs
|
||||
|
||||
|
||||
def read_ptb_sec(ptb_sec_dir):
|
||||
ptb_sec_dir = Path(ptb_sec_dir)
|
||||
files = []
|
||||
for loc in ptb_sec_dir.iterdir():
|
||||
if not str(loc).endswith('parse') and not str(loc).endswith('mrg'):
|
||||
continue
|
||||
filename = loc.parts[-1].split('.')[0]
|
||||
with loc.open() as file_:
|
||||
text = file_.read()
|
||||
sents = []
|
||||
for parse_str in read_ptb.split(text):
|
||||
words, brackets = read_ptb.parse(parse_str, strip_bad_periods=True)
|
||||
words = [_reform_ptb_word(word) for word in words]
|
||||
string = ' '.join(words)
|
||||
sents.append((filename, string))
|
||||
files.append(sents)
|
||||
return files
|
||||
|
||||
|
||||
def _reform_ptb_word(tok):
|
||||
tok = tok.replace("``", '"')
|
||||
tok = tok.replace("`", "'")
|
||||
tok = tok.replace("''", '"')
|
||||
tok = tok.replace('\\', '')
|
||||
tok = tok.replace('-LCB-', '{')
|
||||
tok = tok.replace('-RCB-', '}')
|
||||
tok = tok.replace('-RRB-', ')')
|
||||
tok = tok.replace('-LRB-', '(')
|
||||
tok = tok.replace("'T-", "'T")
|
||||
return tok
|
||||
|
||||
|
||||
def get_alignment(raw_by_para, ptb_by_file):
|
||||
# These are list-of-lists, by paragraph and file respectively.
|
||||
# Flatten them into a list of (outer_id, inner_id, item) triples
|
||||
raw_sents = _flatten(raw_by_para)
|
||||
ptb_sents = list(_flatten(ptb_by_file))
|
||||
|
||||
output = []
|
||||
ptb_idx = 0
|
||||
n_skipped = 0
|
||||
skips = []
|
||||
for (p_id, p_sent_id, raw) in raw_sents:
|
||||
if ptb_idx >= len(ptb_sents):
|
||||
n_skipped += 1
|
||||
continue
|
||||
f_id, f_sent_id, (ptb_id, ptb) = ptb_sents[ptb_idx]
|
||||
alignment = align_chars(raw, ptb)
|
||||
if not alignment:
|
||||
skips.append((ptb, raw))
|
||||
n_skipped += 1
|
||||
continue
|
||||
ptb_idx += 1
|
||||
sepped = []
|
||||
for i, c in enumerate(ptb):
|
||||
if alignment[i] is False:
|
||||
sepped.append('<SEP>')
|
||||
else:
|
||||
sepped.append(c)
|
||||
output.append((f_id, p_id, f_sent_id, (ptb_id, ''.join(sepped))))
|
||||
if n_skipped + len(ptb_sents) != len(raw_sents):
|
||||
for ptb, raw in skips:
|
||||
print(ptb)
|
||||
print(raw)
|
||||
raise Exception
|
||||
return output
|
||||
|
||||
|
||||
def _flatten(nested):
|
||||
flat = []
|
||||
for id1, inner in enumerate(nested):
|
||||
flat.extend((id1, id2, item) for id2, item in enumerate(inner))
|
||||
return flat
|
||||
|
||||
|
||||
def align_chars(raw, ptb):
|
||||
if raw.replace(' ', '') != ptb.replace(' ', ''):
|
||||
return None
|
||||
i = 0
|
||||
j = 0
|
||||
|
||||
length = len(raw)
|
||||
alignment = [False for _ in range(len(ptb))]
|
||||
while i < length:
|
||||
if raw[i] == ' ' and ptb[j] == ' ':
|
||||
alignment[j] = True
|
||||
i += 1
|
||||
j += 1
|
||||
elif raw[i] == ' ':
|
||||
i += 1
|
||||
elif ptb[j] == ' ':
|
||||
j += 1
|
||||
        assert raw[i].lower() == ptb[j].lower(), raw[i:]
        alignment[j] = i
        i += 1
        j += 1
    return alignment


def group_into_files(sents):
    last_id = 0
    last_fn = None
    this = []
    output = []
    for f_id, p_id, s_id, (filename, sent) in sents:
        if f_id != last_id:
            assert last_fn is not None
            output.append((last_fn, this))
            this = []
        last_fn = filename
        this.append((f_id, p_id, s_id, sent))
        last_id = f_id
    if this:
        assert last_fn is not None
        output.append((last_fn, this))
    return output


def group_into_paras(sents):
    last_id = 0
    this = []
    output = []
    for f_id, p_id, s_id, sent in sents:
        if p_id != last_id and this:
            output.append(this)
            this = []
        this.append(sent)
        last_id = p_id
    if this:
        output.append(this)
    return output


def get_sections(odc_dir, ptb_dir, out_dir):
    for i in range(25):
        section = str(i) if i >= 10 else ('0' + str(i))
        odc_loc = path.join(odc_dir, 'wsj%s.txt' % section)
        ptb_sec = path.join(ptb_dir, section)
        out_loc = path.join(out_dir, 'wsj%s.json' % section)
        yield odc_loc, ptb_sec, out_loc


def align_section(raw_paragraphs, ptb_files):
    aligned = get_alignment(raw_paragraphs, ptb_files)
    return [(fn, group_into_paras(sents))
            for fn, sents in group_into_files(aligned)]


def do_wsj(odc_dir, ptb_dir, out_dir):
    for odc_loc, ptb_sec_dir, out_loc in get_sections(odc_dir, ptb_dir, out_dir):
        files = align_section(read_odc(odc_loc), read_ptb_sec(ptb_sec_dir))
        with open(out_loc, 'w') as file_:
            json.dump(files, file_)


def do_web(src_dir, onto_dir, out_dir):
    mapping = dict(line.split() for line in open(path.join(onto_dir, 'map.txt'))
                   if len(line.split()) == 2)
    for annot_fn, src_fn in mapping.items():
        if not annot_fn.startswith('eng'):
            continue

        ptb_loc = path.join(onto_dir, annot_fn + '.parse')
        src_loc = path.join(src_dir, src_fn + '.sgm')

        if path.exists(ptb_loc) and path.exists(src_loc):
            src_doc = sgml_extract(open(src_loc).read())
            ptb_doc = [read_ptb.parse(parse_str, strip_bad_periods=True)[0]
                       for parse_str in read_ptb.split(open(ptb_loc).read())]
            print('Found')
        else:
            print('Miss')


def may_mkdir(parent, *subdirs):
    if not path.exists(parent):
        os.mkdir(parent)
    # Create each nested subdirectory in turn, including the last one.
    for i in range(1, len(subdirs) + 1):
        directories = (parent,) + subdirs[:i]
        subdir = path.join(*directories)
        if not path.exists(subdir):
            os.mkdir(subdir)


def main(odc_dir, onto_dir, out_dir):
    may_mkdir(out_dir, 'wsj', 'align')
    may_mkdir(out_dir, 'web', 'align')
    # do_wsj(odc_dir, path.join(onto_dir, 'wsj', 'orig'),
    #        path.join(out_dir, 'wsj', 'align'))
    do_web(
        path.join(onto_dir, 'data', 'english', 'metadata', 'context', 'wb', 'sel'),
        path.join(onto_dir, 'data', 'english', 'annotations', 'wb'),
        path.join(out_dir, 'web', 'align'))


if __name__ == '__main__':
    plac.call(main)
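To illustrate the scheme above: align_chars maps each character of the PTB string back to an offset in the raw string, leaving False at positions that exist only in the PTB tokenization, and get_alignment renders those positions as <SEP> markers. A minimal sketch, assuming align_chars from the deleted file is in scope; the sample sentence is invented:

# Raw sentence and a PTB-style tokenization of the same text.
raw = "Mr. Vinken isn't here."
ptb = "Mr. Vinken is n't here ."

alignment = align_chars(raw, ptb)
# False marks PTB-only spaces, i.e. token boundaries absent from the raw text.
sepped = ''.join('<SEP>' if alignment[i] is False else c
                 for i, c in enumerate(ptb))
print(sepped)  # Mr. Vinken is<SEP>n't here<SEP>.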
@@ -1,49 +0,0 @@
from __future__ import unicode_literals


def split(text):
    return [sent.strip() for sent in text.split('\n\n') if sent.strip()]


def parse(sent_text, strip_bad_periods=False):
    sent_text = sent_text.strip()
    assert sent_text
    annot = []
    words = []
    id_map = {-1: -1}
    for i, line in enumerate(sent_text.split('\n')):
        word, tag, head, dep = _parse_line(line)
        if strip_bad_periods and words and _is_bad_period(words[-1], word):
            continue
        id_map[i] = len(words)

        annot.append({
            'id': len(words),
            'word': word,
            'tag': tag,
            'head': int(head) - 1,
            'dep': dep})
        words.append(word)
    for entry in annot:
        entry['head'] = id_map[entry['head']]
    return words, annot


def _is_bad_period(prev, period):
    if period != '.':
        return False
    elif prev == '.':
        return False
    elif not prev.endswith('.'):
        return False
    else:
        return True


def _parse_line(line):
    pieces = line.split()
    if len(pieces) == 4:
        return pieces
    else:
        return pieces[1], pieces[3], pieces[5], pieces[6]
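A toy run of the reader above, assuming its parse function is in scope. The four-column (word, tag, head, dep) layout is the one _parse_line expects; heads are 1-based in the input, and the reader shifts them so the root points at -1. The sample lines are invented:

sent = "Hello UH 0 ROOT\n. . 1 punct"
words, annot = parse(sent)
print(words)                              # ['Hello', '.']
print(annot[0]['head'], annot[0]['dep'])  # -1 ROOT
print(annot[1]['head'], annot[1]['dep'])  # 0 punct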
@@ -1,116 +0,0 @@
from __future__ import unicode_literals
import os
from os import path
import re


def split(text):
    """Split an annotation file by sentence. Each sentence's annotation should
    be a single string."""
    return text.strip().split('\n')[1:-1]


def parse(string, strip_bad_periods=False):
"""Given a sentence's annotation string, return a list of word strings,
|
||||
and a list of named entities, where each entity is a (start, end, label)
|
||||
triple."""
|
||||
    tokens = []
    tags = []
    open_tag = None
    # Arbitrary corrections to promote alignment, and ensure that entities
    # begin at a space. This allows us to treat entities as tokens, making it
    # easier to return the list of entities.
    string = string.replace('... .', '...')
    string = string.replace('U.S.</ENAMEX> .', 'U.S.</ENAMEX>')
    string = string.replace('Co.</ENAMEX> .', 'Co.</ENAMEX>')
    string = string.replace('U.S. .', 'U.S.')
    string = string.replace('<ENAMEX ', '<ENAMEX')
    string = string.replace(' E_OFF="', 'E_OFF="')
    string = string.replace(' S_OFF="', 'S_OFF="')
    string = string.replace('units</ENAMEX>-<ENAMEX', 'units</ENAMEX> - <ENAMEX')
    string = string.replace('<ENAMEXTYPE="PERSON"E_OFF="1">Paula</ENAMEX> Zahn', 'Paula Zahn')
    string = string.replace('<ENAMEXTYPE="CARDINAL"><ENAMEXTYPE="CARDINAL">little</ENAMEX> drain</ENAMEX>', 'little drain')
    for substr in string.strip().split():
        substr = _fix_inner_entities(substr)
        tokens.append(_get_text(substr))
        tag, open_tag = _get_tag(substr, open_tag)
        tags.append(tag)
    return tokens, tags


tag_re = re.compile(r'<ENAMEXTYPE="[^"]+">')


def _fix_inner_entities(substr):
    tags = tag_re.findall(substr)
    if '</ENAMEX' in substr and not substr.endswith('</ENAMEX>'):
        substr = substr.replace('</ENAMEX>', '') + '</ENAMEX>'
    if tags:
        substr = tag_re.sub('', substr)
        return tags[0] + substr
    else:
        return substr


def _get_tag(substr, tag):
    if substr.startswith('<'):
        tag = substr.split('"')[1]
        if substr.endswith('>'):
            return 'U-' + tag, None
        else:
            return 'B-%s' % tag, tag
    elif substr.endswith('>'):
        return 'L-' + tag, None
    elif tag is not None:
        return 'I-' + tag, tag
    else:
        return 'O', None


def _get_text(substr):
    if substr.startswith('<'):
        substr = substr.split('>', 1)[1]
    if substr.endswith('>'):
        substr = substr.split('<')[0]
    return reform_string(substr)


def tags_to_entities(tags):
    entities = []
    start = None
    for i, tag in enumerate(tags):
        if tag.startswith('O'):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            if start is not None:
                start = None
            continue
        elif tag == '-':
            continue
        elif tag.startswith('I'):
            assert start is not None, tags[:i]
            continue
        if tag.startswith('U'):
            entities.append((tag[2:], i, i))
        elif tag.startswith('B'):
            start = i
        elif tag.startswith('L'):
            entities.append((tag[2:], start, i))
            start = None
        else:
            raise Exception(tag)
    return entities


def reform_string(tok):
    tok = tok.replace("``", '"')
    tok = tok.replace("`", "'")
    tok = tok.replace("''", '"')
    tok = tok.replace('\\', '')
    tok = tok.replace('-LCB-', '{')
    tok = tok.replace('-RCB-', '}')
    tok = tok.replace('-RRB-', ')')
    tok = tok.replace('-LRB-', '(')
    tok = tok.replace("'T-", "'T")
    tok = tok.replace('-AMP-', '&')
    return tok
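The reader above emits one BILUO tag per token, and tags_to_entities folds a tag sequence back into (label, start, end) token spans. A minimal sketch, assuming tags_to_entities is in scope; the tag sequence is invented:

tags = ['O', 'B-ORG', 'L-ORG', 'U-PERSON', 'O']
print(tags_to_entities(tags))  # [('ORG', 1, 2), ('PERSON', 3, 3)]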
@@ -1,47 +0,0 @@
import re


docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')
doctype_re = re.compile(r'<DOCTYPE SOURCE="[^"]+">([^>]+)</DOCTYPE>')
datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')
headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)
post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)
poster_re = re.compile(r'<POSTER>(.+)</POSTER>')
postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')
tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')


def sgml_extract(text_data):
"""Extract text from the OntoNotes web documents.
|
||||
|
||||
Format:
|
||||
[{
|
||||
docid: string,
|
||||
doctype: string,
|
||||
datetime: string,
|
||||
poster: string,
|
||||
postdate: string
|
||||
text: [string]
|
||||
}]
|
||||
"""
|
||||
    return {
        'docid': _get_one(docid_re, text_data, required=True),
        'doctype': _get_one(doctype_re, text_data, required=True),
        'datetime': _get_one(datetime_re, text_data, required=True),
        'headline': _get_one(headline_re, text_data, required=True),
        'poster': _get_one(poster_re, _get_one(post_re, text_data)),
        'postdate': _get_one(postdate_re, _get_one(post_re, text_data)),
        'text': _get_text(_get_one(post_re, text_data)).strip()
    }


def _get_one(regex, text, required=False):
    matches = regex.search(text)
    if not matches and not required:
        return ''
    assert len(matches.groups()) == 1, matches
    return matches.groups()[0].strip()


def _get_text(data):
    return tag_re.sub('', data).replace('<P>', '').replace('</P>', '')
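A sketch of sgml_extract on an invented OntoNotes-style web document, assuming the function above is in scope; every field value here is made up:

doc = '''<DOCID>doc001</DOCID>
<DOCTYPE SOURCE="weblog">BLOG</DOCTYPE>
<DATETIME>2005-01-01</DATETIME>
<HEADLINE>A headline</HEADLINE>
<POST>
<POSTER>someone</POSTER>
<POSTDATE>2005-01-01</POSTDATE>
Some text here.
</POST>'''

meta = sgml_extract(doc)
print(meta['docid'], meta['poster'])  # doc001 someone
print(meta['text'])                   # Some text here.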
@@ -1,65 +0,0 @@
import re
import os
from os import path


def parse(sent_text, strip_bad_periods=False):
    sent_text = sent_text.strip()
    assert sent_text and sent_text.startswith('(')
    open_brackets = []
    brackets = []
    bracketsRE = re.compile(r'(\()([^\s\)\(]+)|([^\s\)\(]+)?(\))')
    word_i = 0
    words = []
    # Remove outermost bracket
    if sent_text.startswith('(('):
        sent_text = sent_text.replace('((', '( (', 1)
    for match in bracketsRE.finditer(sent_text[2:-1]):
        open_, label, text, close = match.groups()
        if open_:
            assert not close
            assert label.strip()
            open_brackets.append((label, word_i))
        else:
            assert close
            label, start = open_brackets.pop()
            assert label.strip()
            if strip_bad_periods and words and _is_bad_period(words[-1], text):
                continue
            # Traces leave 0-width bracket, but no token
            if text and label != '-NONE-':
                words.append(text)
                word_i += 1
            else:
                brackets.append((label, start, word_i))
    return words, brackets


def _is_bad_period(prev, period):
    if period != '.':
        return False
    elif prev == '.':
        return False
    elif not prev.endswith('.'):
        return False
    else:
        return True


def split(text):
    sentences = []
    current = []

    for line in text.strip().split('\n'):
        line = line.rstrip()
        if not line:
            continue
        # Detect the start of sentences by line starting with (
        # This is messy, but it keeps bracket parsing at the sentence level
        if line.startswith('(') and current:
            sentences.append('\n'.join(current))
            current = []
        current.append(line)
    if current:
        sentences.append('\n'.join(current))
    return sentences
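A toy bracketing run through the parser above, assuming its parse function is in scope. Terminals become words, while closing a phrase bracket (or a -NONE- trace) records a (label, start, end) span over token indices; the tree is invented:

tree = "( (S (NP (NNP Pierre)) (VP (VBZ sleeps))) )"
words, brackets = parse(tree)
print(words)     # ['Pierre', 'sleeps']
print(brackets)  # [('NP', 0, 1), ('VP', 1, 2), ('S', 0, 2)]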