# spaCy/spacy/munge/read_ptb.py
# (repository viewer metadata: 66 lines, 1.9 KiB, Python)

import re
import os
from os import path
# Matches either an opening bracket together with its label (e.g. "(NP"),
# or a closing bracket optionally preceded by terminal text (e.g. "cat)"
# or a bare ")"). Compiled once at import time rather than on every call.
_BRACKETS_RE = re.compile(r'(\()([^\s\)\(]+)|([^\s\)\(]+)?(\))')


def parse(sent_text, strip_bad_periods=False):
    """Parse one bracketed (PTB-style) sentence into words and spans.

    The outermost bracket pair is discarded: after stripping, the first two
    characters and the last character of the text are dropped before
    scanning, so the input is expected to look like ``( (S ...) )`` or
    ``((S ...))``.

    Args:
        sent_text: The bracketed text of a single sentence.
        strip_bad_periods: If true, a terminal for which
            ``_is_bad_period(previous_word, text)`` holds is dropped
            entirely (no word and no bracket is recorded for it).

    Returns:
        A ``(words, brackets)`` tuple. ``words`` is the list of terminal
        strings in order. ``brackets`` is a list of ``(label, start, end)``
        tuples, where ``start`` and ``end`` are the word counts at the time
        the bracket was opened and closed. A bracket is recorded only when
        it closes without terminal text or when its label is ``'-NONE-'``;
        brackets that directly contribute a word are not recorded.

    Raises:
        AssertionError: If the text is empty, does not start with ``(``,
            or a bracket label is blank.
        IndexError: If a closing bracket is seen with no bracket open.
    """
    sent_text = sent_text.strip()
    assert sent_text and sent_text.startswith('(')
    open_brackets = []
    brackets = []
    word_i = 0
    words = []
    # Normalise a leading "((" to "( (" so that the slice below always
    # removes exactly the outermost "( " and its matching final ")".
    if sent_text.startswith('(('):
        sent_text = sent_text.replace('((', '( (', 1)
    for match in _BRACKETS_RE.finditer(sent_text[2:-1]):
        open_, label, text, close = match.groups()
        if open_:
            assert not close
            assert label.strip()
            # Remember where (in words) this constituent starts.
            open_brackets.append((label, word_i))
        else:
            assert close
            label, start = open_brackets.pop()
            assert label.strip()
            if strip_bad_periods and words and _is_bad_period(words[-1], text):
                continue
            # Traces leave 0-width bracket, but no token
            if text and label != '-NONE-':
                words.append(text)
                word_i += 1
            else:
                brackets.append((label, start, word_i))
    return words, brackets
def _is_bad_period(prev, period):
    """Return True if *period* is a '.' token that duplicates *prev*'s dot.

    A period is "bad" when the token is exactly ``'.'`` and the previous
    word already ends with a ``'.'`` without being a lone ``'.'`` itself
    (e.g. ``prev='Inc.'``, ``period='.'``).

    Args:
        prev: The previously emitted word.
        period: The candidate token text; any value other than ``'.'``
            (including ``None``) yields False.

    Returns:
        bool: True only when all of the conditions above hold.
    """
    return period == '.' and prev != '.' and prev.endswith('.')
def split(text):
    """Split a block of bracketed text into one string per sentence.

    Blank lines are dropped and trailing whitespace is removed from every
    line. A new sentence begins at each line whose first character is
    ``(``; all other lines are treated as continuations of the current
    sentence.

    Args:
        text: Text containing zero or more bracketed sentences.

    Returns:
        A list of sentence strings, each with its lines joined by ``'\\n'``.
        Empty or whitespace-only input gives an empty list.
    """
    sentences = []
    current = []
    for line in text.strip().split('\n'):
        line = line.rstrip()
        if not line:
            continue
        # Detect the start of sentences by line starting with (
        # This is messy, but it keeps bracket parsing at the sentence level
        if line.startswith('(') and current:
            sentences.append('\n'.join(current))
            current = []
        current.append(line)
    # Flush the final sentence, which has no following "(" line to end it.
    if current:
        sentences.append('\n'.join(current))
    return sentences