* Remove POS alignment stuff. Now use training data based on raw text, instead of clumsy detokenization stuff

Matthew Honnibal 2014-11-04 01:06:43 +11:00
parent bea762ec04
commit f07457a91f
1 changed file with 28 additions and 49 deletions


@@ -6,56 +6,35 @@ from .en import EN
 from .pos import Tagger
 
 
-def realign_tagged(token_rules, tagged_line, sep='/'):
-    words, pos = zip(*[token.rsplit(sep, 1) for token in tagged_line.split()])
-    positions = util.detokenize(token_rules, words)
-    aligned = []
-    for group in positions:
-        w_group = [words[i] for i in group]
-        p_group = [pos[i] for i in group]
-        aligned.append('<SEP>'.join(w_group) + sep + '_'.join(p_group))
-    return ' '.join(aligned)
-
-
-def read_tagged(detoken_rules, file_, sep='/'):
-    sentences = []
-    for line in file_:
-        if not line.strip():
+def read_gold(file_):
+    paras = file_.read().strip().split('\n\n')
+    golds = []
+    for para in paras:
+        if not para.strip():
             continue
-        line = realign_tagged(detoken_rules, line, sep=sep)
-        tokens, tags = _parse_line(line, sep)
-        assert len(tokens) == len(tags)
-        sentences.append((tokens, tags))
-    return sentences
-
-
-def _parse_line(line, sep):
-    words = []
-    tags = []
-    for token_str in line.split():
-        word, pos = token_str.rsplit(sep, 1)
-        word = word.replace('<SEP>', '')
-        subtokens = EN.tokenize(word)
-        subtags = pos.split('_')
-        while len(subtags) < len(subtokens):
-            subtags.append('NULL')
-        assert len(subtags) == len(subtokens), [t.string for t in subtokens]
-        words.append(word)
-        tags.extend([Tagger.encode_pos(ptb_to_univ(pos)) for pos in subtags])
-    tokens = EN.tokenize(' '.join(words)), tags
-    return tokens
-
-
-def get_tagdict(train_sents):
-    tagdict = {}
-    for tokens, tags in train_sents:
-        for i, tag in enumerate(tags):
-            if tag == 'NULL':
-                continue
-            word = tokens.string(i)
-            tagdict.setdefault(word, {}).setdefault(tag, 0)
-            tagdict[word][tag] += 1
-    return tagdict
+        lines = para.strip().split('\n')
+        raw = lines.pop(0)
+        gold_toks = lines.pop(0)
+        tokens = EN.tokenize(raw)
+        tags = []
+        conll_toks = []
+        for line in lines:
+            pieces = line.split()
+            conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[3]))
+        for i, token in enumerate(tokens):
+            if not conll_toks:
+                tags.append('NULL')
+            elif token.idx == conll_toks[0][0]:
+                tags.append(conll_toks[0][2])
+                conll_toks.pop(0)
+            elif token.idx < conll_toks[0][0]:
+                tags.append('NULL')
+            else:
+                conll_toks.pop(0)
+        assert len(tags) == len(tokens)
+        tags = [Tagger.encode_pos(t) for t in tags]
+        golds.append((tokens, tags))
+    return golds
 
 
 def ptb_to_univ(tag):
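
For reference, the fields the new code touches (pieces[0] as a character offset, pieces[1] as the word form, pieces[3] as the tag) imply that read_gold consumes blank-line-separated paragraphs shaped like the hypothetical sample below: the raw text on the first line, the gold tokenization on the second, then one row per gold token, with offsets indexing into the raw line. The sample and the third (lemma-like) column are guesses for illustration, not taken from the repository:

I can't go.
I ca n't go .
0   I    I    PRP
2   ca   can  MD
4   n't  not  RB
8   go   go   VB
10  .    .    .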
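
The heart of the change is the loop that aligns EN.tokenize's output against those rows purely by character offset, which is what lets the old detokenization rules go. Below is a minimal, self-contained Python sketch of that alignment; the Token class, the whitespace tokenize stand-in, and the sample data are invented for illustration, and the queue handling is restructured slightly so that every token always receives a tag (in the diff, the final else branch can leave a token untagged):

class Token(object):
    # Stand-in for a spaCy token: the string plus its character offset.
    def __init__(self, idx, string):
        self.idx = idx
        self.string = string


def tokenize(raw):
    # Stand-in for EN.tokenize: whitespace tokens with character offsets.
    tokens = []
    offset = 0
    for word in raw.split(' '):
        tokens.append(Token(offset, word))
        offset += len(word) + 1
    return tokens


def align(raw, conll_toks):
    # conll_toks: (char_offset, tag) pairs, sorted by offset.
    tokens = tokenize(raw)
    queue = list(conll_toks)
    tags = []
    for token in tokens:
        # Discard gold tokens starting before this one: their offsets
        # can never match again, so the two tokenizations disagree there.
        while queue and queue[0][0] < token.idx:
            queue.pop(0)
        if queue and token.idx == queue[0][0]:
            tags.append(queue[0][1])   # offsets agree: take the gold tag
            queue.pop(0)
        else:
            tags.append('NULL')        # no gold token starts here
    assert len(tags) == len(tokens)
    return list(zip([t.string for t in tokens], tags))


print(align("I ca n't go", [(0, 'PRP'), (2, 'MD'), (9, 'VB')]))
# [('I', 'PRP'), ('ca', 'MD'), ("n't", 'NULL'), ('go', 'VB')]

Tokens with no gold counterpart come out tagged 'NULL', exactly the sentinel the diff's tagging code carries through to Tagger.encode_pos.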