mirror of https://github.com/explosion/spaCy.git
Remove unused munge package
This commit is contained in:
parent c714841cc8
commit 7b83977020
@@ -1,242 +0,0 @@
"""Align the raw sentences from Read et al (2012) to the PTB tokenization,
|
|
||||||
outputting as a .json file. Used in bin/prepare_treebank.py
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
|
||||||
import json
|
|
||||||
from os import path
|
|
||||||
import os
|
|
||||||
|
|
||||||
from spacy.munge import read_ptb
|
|
||||||
from spacy.munge.read_ontonotes import sgml_extract
|
|
||||||
|
|
||||||
|
|
||||||
def read_odc(section_loc):
|
|
||||||
# Arbitrary patches applied to the _raw_ text to promote alignment.
|
|
||||||
patches = (
|
|
||||||
('. . . .', '...'),
|
|
||||||
('....', '...'),
|
|
||||||
('Co..', 'Co.'),
|
|
||||||
("`", "'"),
|
|
||||||
# OntoNotes specific
|
|
||||||
(" S$", " US$"),
|
|
||||||
("Showtime or a sister service", "Showtime or a service"),
|
|
||||||
("The hotel and gaming company", "The hotel and Gaming company"),
|
|
||||||
("I'm-coming-down-your-throat", "I-'m coming-down-your-throat"),
|
|
||||||
)
|
|
||||||
|
|
||||||
paragraphs = []
|
|
||||||
with open(section_loc) as file_:
|
|
||||||
para = []
|
|
||||||
for line in file_:
|
|
||||||
if line.startswith('['):
|
|
||||||
line = line.split('|', 1)[1].strip()
|
|
||||||
for find, replace in patches:
|
|
||||||
line = line.replace(find, replace)
|
|
||||||
para.append(line)
|
|
||||||
else:
|
|
||||||
paragraphs.append(para)
|
|
||||||
para = []
|
|
||||||
paragraphs.append(para)
|
|
||||||
return paragraphs
|
|
||||||
|
|
||||||
|
|
||||||
def read_ptb_sec(ptb_sec_dir):
|
|
||||||
ptb_sec_dir = Path(ptb_sec_dir)
|
|
||||||
files = []
|
|
||||||
for loc in ptb_sec_dir.iterdir():
|
|
||||||
if not str(loc).endswith('parse') and not str(loc).endswith('mrg'):
|
|
||||||
continue
|
|
||||||
filename = loc.parts[-1].split('.')[0]
|
|
||||||
with loc.open() as file_:
|
|
||||||
text = file_.read()
|
|
||||||
sents = []
|
|
||||||
for parse_str in read_ptb.split(text):
|
|
||||||
words, brackets = read_ptb.parse(parse_str, strip_bad_periods=True)
|
|
||||||
words = [_reform_ptb_word(word) for word in words]
|
|
||||||
string = ' '.join(words)
|
|
||||||
sents.append((filename, string))
|
|
||||||
files.append(sents)
|
|
||||||
return files
|
|
||||||
|
|
||||||
|
|
||||||
def _reform_ptb_word(tok):
|
|
||||||
tok = tok.replace("``", '"')
|
|
||||||
tok = tok.replace("`", "'")
|
|
||||||
tok = tok.replace("''", '"')
|
|
||||||
tok = tok.replace('\\', '')
|
|
||||||
tok = tok.replace('-LCB-', '{')
|
|
||||||
tok = tok.replace('-RCB-', '}')
|
|
||||||
tok = tok.replace('-RRB-', ')')
|
|
||||||
tok = tok.replace('-LRB-', '(')
|
|
||||||
tok = tok.replace("'T-", "'T")
|
|
||||||
return tok
|
|
||||||
|
|
||||||
|
|
||||||
def get_alignment(raw_by_para, ptb_by_file):
|
|
||||||
# These are list-of-lists, by paragraph and file respectively.
|
|
||||||
# Flatten them into a list of (outer_id, inner_id, item) triples
|
|
||||||
raw_sents = _flatten(raw_by_para)
|
|
||||||
ptb_sents = list(_flatten(ptb_by_file))
|
|
||||||
|
|
||||||
output = []
|
|
||||||
ptb_idx = 0
|
|
||||||
n_skipped = 0
|
|
||||||
skips = []
|
|
||||||
for (p_id, p_sent_id, raw) in raw_sents:
|
|
||||||
if ptb_idx >= len(ptb_sents):
|
|
||||||
n_skipped += 1
|
|
||||||
continue
|
|
||||||
f_id, f_sent_id, (ptb_id, ptb) = ptb_sents[ptb_idx]
|
|
||||||
alignment = align_chars(raw, ptb)
|
|
||||||
if not alignment:
|
|
||||||
skips.append((ptb, raw))
|
|
||||||
n_skipped += 1
|
|
||||||
continue
|
|
||||||
ptb_idx += 1
|
|
||||||
sepped = []
|
|
||||||
for i, c in enumerate(ptb):
|
|
||||||
if alignment[i] is False:
|
|
||||||
sepped.append('<SEP>')
|
|
||||||
else:
|
|
||||||
sepped.append(c)
|
|
||||||
output.append((f_id, p_id, f_sent_id, (ptb_id, ''.join(sepped))))
|
|
||||||
if n_skipped + len(ptb_sents) != len(raw_sents):
|
|
||||||
for ptb, raw in skips:
|
|
||||||
print(ptb)
|
|
||||||
print(raw)
|
|
||||||
raise Exception
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
def _flatten(nested):
|
|
||||||
flat = []
|
|
||||||
for id1, inner in enumerate(nested):
|
|
||||||
flat.extend((id1, id2, item) for id2, item in enumerate(inner))
|
|
||||||
return flat
|
|
||||||
|
|
||||||
|
|
||||||
def align_chars(raw, ptb):
|
|
||||||
if raw.replace(' ', '') != ptb.replace(' ', ''):
|
|
||||||
return None
|
|
||||||
i = 0
|
|
||||||
j = 0
|
|
||||||
|
|
||||||
length = len(raw)
|
|
||||||
alignment = [False for _ in range(len(ptb))]
|
|
||||||
while i < length:
|
|
||||||
if raw[i] == ' ' and ptb[j] == ' ':
|
|
||||||
alignment[j] = True
|
|
||||||
i += 1
|
|
||||||
j += 1
|
|
||||||
elif raw[i] == ' ':
|
|
||||||
i += 1
|
|
||||||
elif ptb[j] == ' ':
|
|
||||||
j += 1
|
|
||||||
assert raw[i].lower() == ptb[j].lower(), raw[i:1]
|
|
||||||
alignment[j] = i
|
|
||||||
i += 1; j += 1
|
|
||||||
return alignment
|
|
||||||
|
|
||||||
|
|
||||||
def group_into_files(sents):
|
|
||||||
last_id = 0
|
|
||||||
last_fn = None
|
|
||||||
this = []
|
|
||||||
output = []
|
|
||||||
for f_id, p_id, s_id, (filename, sent) in sents:
|
|
||||||
if f_id != last_id:
|
|
||||||
assert last_fn is not None
|
|
||||||
output.append((last_fn, this))
|
|
||||||
this = []
|
|
||||||
last_fn = filename
|
|
||||||
this.append((f_id, p_id, s_id, sent))
|
|
||||||
last_id = f_id
|
|
||||||
if this:
|
|
||||||
assert last_fn is not None
|
|
||||||
output.append((last_fn, this))
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
def group_into_paras(sents):
|
|
||||||
last_id = 0
|
|
||||||
this = []
|
|
||||||
output = []
|
|
||||||
for f_id, p_id, s_id, sent in sents:
|
|
||||||
if p_id != last_id and this:
|
|
||||||
output.append(this)
|
|
||||||
this = []
|
|
||||||
this.append(sent)
|
|
||||||
last_id = p_id
|
|
||||||
if this:
|
|
||||||
output.append(this)
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
def get_sections(odc_dir, ptb_dir, out_dir):
|
|
||||||
for i in range(25):
|
|
||||||
section = str(i) if i >= 10 else ('0' + str(i))
|
|
||||||
odc_loc = path.join(odc_dir, 'wsj%s.txt' % section)
|
|
||||||
ptb_sec = path.join(ptb_dir, section)
|
|
||||||
out_loc = path.join(out_dir, 'wsj%s.json' % section)
|
|
||||||
yield odc_loc, ptb_sec, out_loc
|
|
||||||
|
|
||||||
|
|
||||||
def align_section(raw_paragraphs, ptb_files):
|
|
||||||
aligned = get_alignment(raw_paragraphs, ptb_files)
|
|
||||||
return [(fn, group_into_paras(sents))
|
|
||||||
for fn, sents in group_into_files(aligned)]
|
|
||||||
|
|
||||||
|
|
||||||
def do_wsj(odc_dir, ptb_dir, out_dir):
|
|
||||||
for odc_loc, ptb_sec_dir, out_loc in get_sections(odc_dir, ptb_dir, out_dir):
|
|
||||||
files = align_section(read_odc(odc_loc), read_ptb_sec(ptb_sec_dir))
|
|
||||||
with open(out_loc, 'w') as file_:
|
|
||||||
json.dump(files, file_)
|
|
||||||
|
|
||||||
|
|
||||||
def do_web(src_dir, onto_dir, out_dir):
|
|
||||||
mapping = dict(line.split() for line in open(path.join(onto_dir, 'map.txt'))
|
|
||||||
if len(line.split()) == 2)
|
|
||||||
for annot_fn, src_fn in mapping.items():
|
|
||||||
if not annot_fn.startswith('eng'):
|
|
||||||
continue
|
|
||||||
|
|
||||||
ptb_loc = path.join(onto_dir, annot_fn + '.parse')
|
|
||||||
src_loc = path.join(src_dir, src_fn + '.sgm')
|
|
||||||
|
|
||||||
if path.exists(ptb_loc) and path.exists(src_loc):
|
|
||||||
src_doc = sgml_extract(open(src_loc).read())
|
|
||||||
ptb_doc = [read_ptb.parse(parse_str, strip_bad_periods=True)[0]
|
|
||||||
for parse_str in read_ptb.split(open(ptb_loc).read())]
|
|
||||||
print('Found')
|
|
||||||
else:
|
|
||||||
print('Miss')
|
|
||||||
|
|
||||||
|
|
||||||
def may_mkdir(parent, *subdirs):
|
|
||||||
if not path.exists(parent):
|
|
||||||
os.mkdir(parent)
|
|
||||||
for i in range(1, len(subdirs)):
|
|
||||||
directories = (parent,) + subdirs[:i]
|
|
||||||
subdir = path.join(*directories)
|
|
||||||
if not path.exists(subdir):
|
|
||||||
os.mkdir(subdir)
|
|
||||||
|
|
||||||
|
|
||||||
def main(odc_dir, onto_dir, out_dir):
|
|
||||||
may_mkdir(out_dir, 'wsj', 'align')
|
|
||||||
may_mkdir(out_dir, 'web', 'align')
|
|
||||||
#do_wsj(odc_dir, path.join(ontonotes_dir, 'wsj', 'orig'),
|
|
||||||
# path.join(out_dir, 'wsj', 'align'))
|
|
||||||
do_web(
|
|
||||||
path.join(onto_dir, 'data', 'english', 'metadata', 'context', 'wb', 'sel'),
|
|
||||||
path.join(onto_dir, 'data', 'english', 'annotations', 'wb'),
|
|
||||||
path.join(out_dir, 'web', 'align'))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
plac.call(main)
|
|
|
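For context, a small sketch of what align_chars computes (the strings here are invented, not corpus data): each PTB character maps to its offset in the raw text, and PTB positions with no raw counterpart, i.e. tokenizer-inserted spaces, stay False, which get_alignment renders as <SEP>:

    raw = "can't"
    ptb = "ca n't"
    assert align_chars(raw, ptb) == [0, 1, False, 2, 3, 4]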
@@ -1,49 +0,0 @@
from __future__ import unicode_literals


def split(text):
    return [sent.strip() for sent in text.split('\n\n') if sent.strip()]


def parse(sent_text, strip_bad_periods=False):
    sent_text = sent_text.strip()
    assert sent_text
    annot = []
    words = []
    id_map = {-1: -1}
    for i, line in enumerate(sent_text.split('\n')):
        word, tag, head, dep = _parse_line(line)
        if strip_bad_periods and words and _is_bad_period(words[-1], word):
            continue
        id_map[i] = len(words)

        annot.append({
            'id': len(words),
            'word': word,
            'tag': tag,
            'head': int(head) - 1,
            'dep': dep})
        words.append(word)
    for entry in annot:
        entry['head'] = id_map[entry['head']]
    return words, annot


def _is_bad_period(prev, period):
    if period != '.':
        return False
    elif prev == '.':
        return False
    elif not prev.endswith('.'):
        return False
    else:
        return True


def _parse_line(line):
    pieces = line.split()
    if len(pieces) == 4:
        return pieces
    else:
        return pieces[1], pieces[3], pieces[5], pieces[6]
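As a usage sketch (the input is invented for illustration, not real treebank data): parse reads one token per line in the four-column form "word tag head dep", with 1-based heads and 0 for the root, and re-numbers heads through id_map when bad periods are stripped:

    words, annot = parse('Pierre NNP 2 nsubj\nsleeps VBZ 0 ROOT')
    assert words == ['Pierre', 'sleeps']
    assert annot[0]['head'] == 1   # Pierre attaches to sleeps
    assert annot[1]['head'] == -1  # root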
@@ -1,116 +0,0 @@
from __future__ import unicode_literals
import os
from os import path
import re


def split(text):
    """Split an annotation file by sentence. Each sentence's annotation should
    be a single string."""
    return text.strip().split('\n')[1:-1]


def parse(string, strip_bad_periods=False):
    """Given a sentence's annotation string, return a list of token strings
    and a list of BILUO tags, one per token. Use tags_to_entities to convert
    the tags into (label, start, end) triples."""
    tokens = []
    tags = []
    open_tag = None
    # Arbitrary corrections to promote alignment, and ensure that entities
    # begin at a space. This allows us to treat entities as tokens, making it
    # easier to return the list of entities.
    string = string.replace('... .', '...')
    string = string.replace('U.S.</ENAMEX> .', 'U.S.</ENAMEX>')
    string = string.replace('Co.</ENAMEX> .', 'Co.</ENAMEX>')
    string = string.replace('U.S. .', 'U.S.')
    string = string.replace('<ENAMEX ', '<ENAMEX')
    string = string.replace(' E_OFF="', 'E_OFF="')
    string = string.replace(' S_OFF="', 'S_OFF="')
    string = string.replace('units</ENAMEX>-<ENAMEX', 'units</ENAMEX> - <ENAMEX')
    string = string.replace('<ENAMEXTYPE="PERSON"E_OFF="1">Paula</ENAMEX> Zahn', 'Paula Zahn')
    string = string.replace('<ENAMEXTYPE="CARDINAL"><ENAMEXTYPE="CARDINAL">little</ENAMEX> drain</ENAMEX>', 'little drain')
    for substr in string.strip().split():
        substr = _fix_inner_entities(substr)
        tokens.append(_get_text(substr))
        tag, open_tag = _get_tag(substr, open_tag)
        tags.append(tag)
    return tokens, tags


tag_re = re.compile(r'<ENAMEXTYPE="[^"]+">')


def _fix_inner_entities(substr):
    tags = tag_re.findall(substr)
    if '</ENAMEX' in substr and not substr.endswith('</ENAMEX'):
        substr = substr.replace('</ENAMEX>', '') + '</ENAMEX>'
    if tags:
        substr = tag_re.sub('', substr)
        return tags[0] + substr
    else:
        return substr


def _get_tag(substr, tag):
    if substr.startswith('<'):
        tag = substr.split('"')[1]
        if substr.endswith('>'):
            return 'U-' + tag, None
        else:
            return 'B-%s' % tag, tag
    elif substr.endswith('>'):
        return 'L-' + tag, None
    elif tag is not None:
        return 'I-' + tag, tag
    else:
        return 'O', None


def _get_text(substr):
    if substr.startswith('<'):
        substr = substr.split('>', 1)[1]
    if substr.endswith('>'):
        substr = substr.split('<')[0]
    return reform_string(substr)


def tags_to_entities(tags):
    entities = []
    start = None
    for i, tag in enumerate(tags):
        if tag.startswith('O'):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            if start is not None:
                start = None
            continue
        elif tag == '-':
            continue
        elif tag.startswith('I'):
            assert start is not None, tags[:i]
            continue
        if tag.startswith('U'):
            entities.append((tag[2:], i, i))
        elif tag.startswith('B'):
            start = i
        elif tag.startswith('L'):
            entities.append((tag[2:], start, i))
            start = None
        else:
            raise Exception(tag)
    return entities


def reform_string(tok):
    tok = tok.replace("``", '"')
    tok = tok.replace("`", "'")
    tok = tok.replace("''", '"')
    tok = tok.replace('\\', '')
    tok = tok.replace('-LCB-', '{')
    tok = tok.replace('-RCB-', '}')
    tok = tok.replace('-RRB-', ')')
    tok = tok.replace('-LRB-', '(')
    tok = tok.replace("'T-", "'T")
    tok = tok.replace('-AMP-', '&')
    return tok
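For reference, a minimal sketch of the BILUO conversion (the tag sequence is invented for illustration): tags_to_entities turns per-token tags into (label, start, end) triples with inclusive token indices:

    tags = ['O', 'B-ORG', 'L-ORG', 'U-GPE', 'O']
    assert tags_to_entities(tags) == [('ORG', 1, 2), ('GPE', 3, 3)]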
spacy/munge/read_ontonotes.py
@@ -1,47 +0,0 @@
import re


docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')
doctype_re = re.compile(r'<DOCTYPE SOURCE="[^"]+">([^>]+)</DOCTYPE>')
datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')
headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)
post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)
poster_re = re.compile(r'<POSTER>(.+)</POSTER>')
postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')
tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')


def sgml_extract(text_data):
    """Extract text from the OntoNotes web documents.

    Format:
    {
        docid: string,
        doctype: string,
        datetime: string,
        headline: string,
        poster: string,
        postdate: string,
        text: string
    }
    """
    return {
        'docid': _get_one(docid_re, text_data, required=True),
        'doctype': _get_one(doctype_re, text_data, required=True),
        'datetime': _get_one(datetime_re, text_data, required=True),
        'headline': _get_one(headline_re, text_data, required=True),
        'poster': _get_one(poster_re, _get_one(post_re, text_data)),
        'postdate': _get_one(postdate_re, _get_one(post_re, text_data)),
        'text': _get_text(_get_one(post_re, text_data)).strip()
    }


def _get_one(regex, text, required=False):
    matches = regex.search(text)
    if not matches and not required:
        return ''
    assert len(matches.groups()) == 1, matches
    return matches.groups()[0].strip()


def _get_text(data):
    return tag_re.sub('', data).replace('<P>', '').replace('</P>', '')
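A usage sketch on a hypothetical minimal document (the field values and layout are invented, not real OntoNotes data). Note that tag_re strips any one-span <TAG>...</TAG> pair from the post body, so running text survives only when it is not wrapped in a closed tag pair:

    doc = sgml_extract(
        '<DOCID>doc001@wb@en@on</DOCID>\n'
        '<DOCTYPE SOURCE="web">BLOG</DOCTYPE>\n'
        '<DATETIME>2006-01-01T12:00:00</DATETIME>\n'
        '<HEADLINE>A headline</HEADLINE>\n'
        '<POST>\n'
        '<POSTER>someone</POSTER>\n'
        '<POSTDATE>2006-01-01</POSTDATE>\n'
        '<P>\n'
        'Body text of the post.\n'
        '</POST>'
    )
    assert doc['docid'] == 'doc001@wb@en@on'
    assert doc['poster'] == 'someone'
    assert doc['text'] == 'Body text of the post.'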
spacy/munge/read_ptb.py
@@ -1,65 +0,0 @@
import re
import os
from os import path


def parse(sent_text, strip_bad_periods=False):
    sent_text = sent_text.strip()
    assert sent_text and sent_text.startswith('(')
    open_brackets = []
    brackets = []
    bracketsRE = re.compile(r'(\()([^\s\)\(]+)|([^\s\)\(]+)?(\))')
    word_i = 0
    words = []
    # Remove outermost bracket
    if sent_text.startswith('(('):
        sent_text = sent_text.replace('((', '( (', 1)
    for match in bracketsRE.finditer(sent_text[2:-1]):
        open_, label, text, close = match.groups()
        if open_:
            assert not close
            assert label.strip()
            open_brackets.append((label, word_i))
        else:
            assert close
            label, start = open_brackets.pop()
            assert label.strip()
            if strip_bad_periods and words and _is_bad_period(words[-1], text):
                continue
            # Traces leave 0-width bracket, but no token
            if text and label != '-NONE-':
                words.append(text)
                word_i += 1
            else:
                brackets.append((label, start, word_i))
    return words, brackets


def _is_bad_period(prev, period):
    if period != '.':
        return False
    elif prev == '.':
        return False
    elif not prev.endswith('.'):
        return False
    else:
        return True


def split(text):
    sentences = []
    current = []

    for line in text.strip().split('\n'):
        line = line.rstrip()
        if not line:
            continue
        # Detect the start of sentences by line starting with (
        # This is messy, but it keeps bracket parsing at the sentence level
        if line.startswith('(') and current:
            sentences.append('\n'.join(current))
            current = []
        current.append(line)
    if current:
        sentences.append('\n'.join(current))
    return sentences
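A usage sketch (the bracketing is invented for illustration): parse returns the terminal tokens plus (label, start, end) spans for the non-terminal brackets, with end exclusive:

    tree = '( (S (NP (NNP Pierre)) (VP (VBZ sleeps)) (. .)) )'
    words, brackets = parse(tree)
    assert words == ['Pierre', 'sleeps', '.']
    assert ('NP', 0, 1) in brackets and ('S', 0, 3) in brackets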