From 7b83977020d8ede234620c1727d606c0bc230745 Mon Sep 17 00:00:00 2001
From: ines
Date: Tue, 9 May 2017 00:16:16 +0200
Subject: [PATCH] Remove unused munge package

---
 spacy/munge/__init__.py       |   0
 spacy/munge/align_raw.py      | 242 ----------------------------------
 spacy/munge/read_conll.py     |  49 -------
 spacy/munge/read_ner.py       | 116 ----------------
 spacy/munge/read_ontonotes.py |  47 -------
 spacy/munge/read_ptb.py       |  65 ---------
 6 files changed, 519 deletions(-)
 delete mode 100644 spacy/munge/__init__.py
 delete mode 100644 spacy/munge/align_raw.py
 delete mode 100644 spacy/munge/read_conll.py
 delete mode 100644 spacy/munge/read_ner.py
 delete mode 100644 spacy/munge/read_ontonotes.py
 delete mode 100644 spacy/munge/read_ptb.py

diff --git a/spacy/munge/__init__.py b/spacy/munge/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/munge/align_raw.py b/spacy/munge/align_raw.py
deleted file mode 100644
index 6bdb91abf..000000000
--- a/spacy/munge/align_raw.py
+++ /dev/null
@@ -1,242 +0,0 @@
-"""Align the raw sentences from Read et al (2012) to the PTB tokenization,
-outputting as a .json file. Used in bin/prepare_treebank.py
-"""
-from __future__ import unicode_literals
-
-import plac
-from pathlib import Path
-import json
-from os import path
-import os
-
-from spacy.munge import read_ptb
-from spacy.munge.read_ontonotes import sgml_extract
-
-
-def read_odc(section_loc):
-    # Arbitrary patches applied to the _raw_ text to promote alignment.
-    patches = (
-        ('. . . .', '...'),
-        ('....', '...'),
-        ('Co..', 'Co.'),
-        ("`", "'"),
-        # OntoNotes specific
-        (" S$", " US$"),
-        ("Showtime or a sister service", "Showtime or a service"),
-        ("The hotel and gaming company", "The hotel and Gaming company"),
-        ("I'm-coming-down-your-throat", "I-'m coming-down-your-throat"),
-    )
-
-    paragraphs = []
-    with open(section_loc) as file_:
-        para = []
-        for line in file_:
-            if line.startswith('['):
-                line = line.split('|', 1)[1].strip()
-                for find, replace in patches:
-                    line = line.replace(find, replace)
-                para.append(line)
-            else:
-                paragraphs.append(para)
-                para = []
-        paragraphs.append(para)
-    return paragraphs
-
-
-def read_ptb_sec(ptb_sec_dir):
-    ptb_sec_dir = Path(ptb_sec_dir)
-    files = []
-    for loc in ptb_sec_dir.iterdir():
-        if not str(loc).endswith('parse') and not str(loc).endswith('mrg'):
-            continue
-        filename = loc.parts[-1].split('.')[0]
-        with loc.open() as file_:
-            text = file_.read()
-        sents = []
-        for parse_str in read_ptb.split(text):
-            words, brackets = read_ptb.parse(parse_str, strip_bad_periods=True)
-            words = [_reform_ptb_word(word) for word in words]
-            string = ' '.join(words)
-            sents.append((filename, string))
-        files.append(sents)
-    return files
-
-
-def _reform_ptb_word(tok):
-    tok = tok.replace("``", '"')
-    tok = tok.replace("`", "'")
-    tok = tok.replace("''", '"')
-    tok = tok.replace('\\', '')
-    tok = tok.replace('-LCB-', '{')
-    tok = tok.replace('-RCB-', '}')
-    tok = tok.replace('-RRB-', ')')
-    tok = tok.replace('-LRB-', '(')
-    tok = tok.replace("'T-", "'T")
-    return tok
-
-
-def get_alignment(raw_by_para, ptb_by_file):
-    # These are list-of-lists, by paragraph and file respectively.
-    # Flatten them into a list of (outer_id, inner_id, item) triples
-    raw_sents = _flatten(raw_by_para)
-    ptb_sents = list(_flatten(ptb_by_file))
-
-    output = []
-    ptb_idx = 0
-    n_skipped = 0
-    skips = []
-    for (p_id, p_sent_id, raw) in raw_sents:
-        if ptb_idx >= len(ptb_sents):
-            n_skipped += 1
-            continue
-        f_id, f_sent_id, (ptb_id, ptb) = ptb_sents[ptb_idx]
-        alignment = align_chars(raw, ptb)
-        if not alignment:
-            skips.append((ptb, raw))
-            n_skipped += 1
-            continue
-        ptb_idx += 1
-        sepped = []
-        for i, c in enumerate(ptb):
-            if alignment[i] is False:
-                sepped.append('<SEP>')
-            else:
-                sepped.append(c)
-        output.append((f_id, p_id, f_sent_id, (ptb_id, ''.join(sepped))))
-    if n_skipped + len(ptb_sents) != len(raw_sents):
-        for ptb, raw in skips:
-            print(ptb)
-            print(raw)
-        raise Exception
-    return output
-
-
-def _flatten(nested):
-    flat = []
-    for id1, inner in enumerate(nested):
-        flat.extend((id1, id2, item) for id2, item in enumerate(inner))
-    return flat
-
-
-def align_chars(raw, ptb):
-    if raw.replace(' ', '') != ptb.replace(' ', ''):
-        return None
-    i = 0
-    j = 0
-
-    length = len(raw)
-    alignment = [False for _ in range(len(ptb))]
-    while i < length:
-        if raw[i] == ' ' and ptb[j] == ' ':
-            alignment[j] = True
-            i += 1
-            j += 1
-        elif raw[i] == ' ':
-            i += 1
-        elif ptb[j] == ' ':
-            j += 1
-        else:
-            assert raw[i].lower() == ptb[j].lower(), raw[i:1]
-            alignment[j] = i
-            i += 1; j += 1
-    return alignment
-
-
-def group_into_files(sents):
-    last_id = 0
-    last_fn = None
-    this = []
-    output = []
-    for f_id, p_id, s_id, (filename, sent) in sents:
-        if f_id != last_id:
-            assert last_fn is not None
-            output.append((last_fn, this))
-            this = []
-        last_fn = filename
-        this.append((f_id, p_id, s_id, sent))
-        last_id = f_id
-    if this:
-        assert last_fn is not None
-        output.append((last_fn, this))
-    return output
-
-
-def group_into_paras(sents):
-    last_id = 0
-    this = []
-    output = []
-    for f_id, p_id, s_id, sent in sents:
-        if p_id != last_id and this:
-            output.append(this)
-            this = []
-        this.append(sent)
-        last_id = p_id
-    if this:
-        output.append(this)
-    return output
-
-
-def get_sections(odc_dir, ptb_dir, out_dir):
-    for i in range(25):
-        section = str(i) if i >= 10 else ('0' + str(i))
-        odc_loc = path.join(odc_dir, 'wsj%s.txt' % section)
-        ptb_sec = path.join(ptb_dir, section)
-        out_loc = path.join(out_dir, 'wsj%s.json' % section)
-        yield odc_loc, ptb_sec, out_loc
-
-
-def align_section(raw_paragraphs, ptb_files):
-    aligned = get_alignment(raw_paragraphs, ptb_files)
-    return [(fn, group_into_paras(sents))
-            for fn, sents in group_into_files(aligned)]
-
-
-def do_wsj(odc_dir, ptb_dir, out_dir):
-    for odc_loc, ptb_sec_dir, out_loc in get_sections(odc_dir, ptb_dir, out_dir):
-        files = align_section(read_odc(odc_loc), read_ptb_sec(ptb_sec_dir))
-        with open(out_loc, 'w') as file_:
-            json.dump(files, file_)
-
-
-def do_web(src_dir, onto_dir, out_dir):
-    mapping = dict(line.split() for line in open(path.join(onto_dir, 'map.txt'))
-                   if len(line.split()) == 2)
-    for annot_fn, src_fn in mapping.items():
-        if not annot_fn.startswith('eng'):
-            continue
-
-        ptb_loc = path.join(onto_dir, annot_fn + '.parse')
-        src_loc = path.join(src_dir, src_fn + '.sgm')
-
-        if path.exists(ptb_loc) and path.exists(src_loc):
-            src_doc = sgml_extract(open(src_loc).read())
-            ptb_doc = [read_ptb.parse(parse_str, strip_bad_periods=True)[0]
-                       for parse_str in read_ptb.split(open(ptb_loc).read())]
-            print('Found')
-        else:
-            print('Miss')
-
-
-def may_mkdir(parent, *subdirs):
-    if not path.exists(parent):
-        os.mkdir(parent)
-    for i in range(1, len(subdirs)):
-        directories = (parent,) + subdirs[:i]
-        subdir = path.join(*directories)
-        if not path.exists(subdir):
-            os.mkdir(subdir)
-
-
-def main(odc_dir, onto_dir, out_dir):
-    may_mkdir(out_dir, 'wsj', 'align')
-    may_mkdir(out_dir, 'web', 'align')
-    #do_wsj(odc_dir, path.join(ontonotes_dir, 'wsj', 'orig'),
-    #    path.join(out_dir, 'wsj', 'align'))
-    do_web(
-        path.join(onto_dir, 'data', 'english', 'metadata', 'context', 'wb', 'sel'),
-        path.join(onto_dir, 'data', 'english', 'annotations', 'wb'),
-        path.join(out_dir, 'web', 'align'))
-
-
-if __name__ == '__main__':
-    plac.call(main)
diff --git a/spacy/munge/read_conll.py b/spacy/munge/read_conll.py
deleted file mode 100644
index a120ea497..000000000
--- a/spacy/munge/read_conll.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from __future__ import unicode_literals
-
-
-def split(text):
-    return [sent.strip() for sent in text.split('\n\n') if sent.strip()]
-
-
-def parse(sent_text, strip_bad_periods=False):
-    sent_text = sent_text.strip()
-    assert sent_text
-    annot = []
-    words = []
-    id_map = {-1: -1}
-    for i, line in enumerate(sent_text.split('\n')):
-        word, tag, head, dep = _parse_line(line)
-        if strip_bad_periods and words and _is_bad_period(words[-1], word):
-            continue
-        id_map[i] = len(words)
-
-        annot.append({
-            'id': len(words),
-            'word': word,
-            'tag': tag,
-            'head': int(head) - 1,
-            'dep': dep})
-        words.append(word)
-    for entry in annot:
-        entry['head'] = id_map[entry['head']]
-    return words, annot
-
-
-def _is_bad_period(prev, period):
-    if period != '.':
-        return False
-    elif prev == '.':
-        return False
-    elif not prev.endswith('.'):
-        return False
-    else:
-        return True
-
-
-def _parse_line(line):
-    pieces = line.split()
-    if len(pieces) == 4:
-        return pieces
-    else:
-        return pieces[1], pieces[3], pieces[5], pieces[6]
-
diff --git a/spacy/munge/read_ner.py b/spacy/munge/read_ner.py
deleted file mode 100644
index 23b3f490f..000000000
--- a/spacy/munge/read_ner.py
+++ /dev/null
@@ -1,116 +0,0 @@
-from __future__ import unicode_literals
-import os
-from os import path
-import re
-
-
-def split(text):
-    """Split an annotation file by sentence. Each sentence's annotation should
-    be a single string."""
-    return text.strip().split('\n')[1:-1]
-
-
-def parse(string, strip_bad_periods=False):
-    """Given a sentence's annotation string, return a list of word strings,
-    and a list of named entities, where each entity is a (start, end, label)
-    triple."""
-    tokens = []
-    tags = []
-    open_tag = None
-    # Arbitrary corrections to promote alignment, and ensure that entities
-    # begin at a space. This allows us to treat entities as tokens, making it
-    # easier to return the list of entities.
-    string = string.replace('... .', '...')
-    string = string.replace('U.S. .', 'U.S.')
-    string = string.replace('Co. .', 'Co.')
-    string = string.replace('U.S. .', 'U.S.')
-    string = string.replace('- - Paula Zahn', 'Paula Zahn')
-    string = string.replace('little drain', 'little drain')
-    for substr in string.strip().split():
-        substr = _fix_inner_entities(substr)
-        tokens.append(_get_text(substr))
-        try:
-            tag, open_tag = _get_tag(substr, open_tag)
-        except:
-            raise
-        tags.append(tag)
-    return tokens, tags
-
-
-tag_re = re.compile(r'<ENAMEX TYPE="[^"]+">')
-def _fix_inner_entities(substr):
-    tags = tag_re.findall(substr)
-    if '</ENAMEX>' in substr and not substr.endswith('</ENAMEX>'):
-        substr = substr.replace('</ENAMEX>', '') + '</ENAMEX>'
-    if tags:
-        substr = tag_re.sub('', substr)
-        return tags[0] + substr
-    else:
-        return substr
-
-
-def _get_tag(substr, tag):
-    if substr.startswith('<'):
-        tag = substr.split('"')[1]
-        if substr.endswith('>'):
-            return 'U-' + tag, None
-        else:
-            return 'B-%s' % tag, tag
-    elif substr.endswith('>'):
-        return 'L-' + tag, None
-    elif tag is not None:
-        return 'I-' + tag, tag
-    else:
-        return 'O', None
-
-
-def _get_text(substr):
-    if substr.startswith('<'):
-        substr = substr.split('>', 1)[1]
-    if substr.endswith('>'):
-        substr = substr.split('<')[0]
-    return reform_string(substr)
-
-
-def tags_to_entities(tags):
-    entities = []
-    start = None
-    for i, tag in enumerate(tags):
-        if tag.startswith('O'):
-            # TODO: We shouldn't be getting these malformed inputs. Fix this.
-            if start is not None:
-                start = None
-            continue
-        elif tag == '-':
-            continue
-        elif tag.startswith('I'):
-            assert start is not None, tags[:i]
-            continue
-        if tag.startswith('U'):
-            entities.append((tag[2:], i, i))
-        elif tag.startswith('B'):
-            start = i
-        elif tag.startswith('L'):
-            entities.append((tag[2:], start, i))
-            start = None
-        else:
-            raise Exception(tag)
-    return entities
-
-
-def reform_string(tok):
-    tok = tok.replace("``", '"')
-    tok = tok.replace("`", "'")
-    tok = tok.replace("''", '"')
-    tok = tok.replace('\\', '')
-    tok = tok.replace('-LCB-', '{')
-    tok = tok.replace('-RCB-', '}')
-    tok = tok.replace('-RRB-', ')')
-    tok = tok.replace('-LRB-', '(')
-    tok = tok.replace("'T-", "'T")
-    tok = tok.replace('-AMP-', '&')
-    return tok
diff --git a/spacy/munge/read_ontonotes.py b/spacy/munge/read_ontonotes.py
deleted file mode 100644
index 38c3c780e..000000000
--- a/spacy/munge/read_ontonotes.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import re
-
-
-docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')
-doctype_re = re.compile(r'<DOCTYPE>([^>]+)</DOCTYPE>')
-datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')
-headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)
-post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)
-poster_re = re.compile(r'<POSTER>(.+)</POSTER>')
-postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')
-tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')
-
-
-def sgml_extract(text_data):
-    """Extract text from the OntoNotes web documents.
-
-    Format:
-    [{
-        docid: string,
-        doctype: string,
-        datetime: string,
-        poster: string,
-        postdate: string
-        text: [string]
-    }]
-    """
-    return {
-        'docid': _get_one(docid_re, text_data, required=True),
-        'doctype': _get_one(doctype_re, text_data, required=True),
-        'datetime': _get_one(datetime_re, text_data, required=True),
-        'headline': _get_one(headline_re, text_data, required=True),
-        'poster': _get_one(poster_re, _get_one(post_re, text_data)),
-        'postdate': _get_one(postdate_re, _get_one(post_re, text_data)),
-        'text': _get_text(_get_one(post_re, text_data)).strip()
-    }
-
-
-def _get_one(regex, text, required=False):
-    matches = regex.search(text)
-    if not matches and not required:
-        return ''
-    assert len(matches.groups()) == 1, matches
-    return matches.groups()[0].strip()
-
-
-def _get_text(data):
-    return tag_re.sub('', data).replace('<P>', '').replace('</P>', '')
diff --git a/spacy/munge/read_ptb.py b/spacy/munge/read_ptb.py
deleted file mode 100644
index 609397ba0..000000000
--- a/spacy/munge/read_ptb.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import re
-import os
-from os import path
-
-
-def parse(sent_text, strip_bad_periods=False):
-    sent_text = sent_text.strip()
-    assert sent_text and sent_text.startswith('(')
-    open_brackets = []
-    brackets = []
-    bracketsRE = re.compile(r'(\()([^\s\)\(]+)|([^\s\)\(]+)?(\))')
-    word_i = 0
-    words = []
-    # Remove outermost bracket
-    if sent_text.startswith('(('):
-        sent_text = sent_text.replace('((', '( (', 1)
-    for match in bracketsRE.finditer(sent_text[2:-1]):
-        open_, label, text, close = match.groups()
-        if open_:
-            assert not close
-            assert label.strip()
-            open_brackets.append((label, word_i))
-        else:
-            assert close
-            label, start = open_brackets.pop()
-            assert label.strip()
-            if strip_bad_periods and words and _is_bad_period(words[-1], text):
-                continue
-            # Traces leave 0-width bracket, but no token
-            if text and label != '-NONE-':
-                words.append(text)
-                word_i += 1
-            else:
-                brackets.append((label, start, word_i))
-    return words, brackets
-
-
-def _is_bad_period(prev, period):
-    if period != '.':
-        return False
-    elif prev == '.':
-        return False
-    elif not prev.endswith('.'):
-        return False
-    else:
-        return True
-
-
-def split(text):
-    sentences = []
-    current = []
-
-    for line in text.strip().split('\n'):
-        line = line.rstrip()
-        if not line:
-            continue
-        # Detect the start of sentences by line starting with (
-        # This is messy, but it keeps bracket parsing at the sentence level
-        if line.startswith('(') and current:
-            sentences.append('\n'.join(current))
-            current = []
-        current.append(line)
-    if current:
-        sentences.append('\n'.join(current))
-    return sentences