mirror of https://github.com/explosion/spaCy.git
* Remove POS alignment stuff. Now use training data based on raw text, instead of clumsy detokenization stuff
This commit is contained in:
parent
bea762ec04
commit
f07457a91f
|
@ -6,56 +6,35 @@ from .en import EN
|
||||||
from .pos import Tagger
|
from .pos import Tagger
|
||||||
|
|
||||||
|
|
||||||
def realign_tagged(token_rules, tagged_line, sep='/'):
|
def read_gold(file_):
|
||||||
words, pos = zip(*[token.rsplit(sep, 1) for token in tagged_line.split()])
|
paras = file_.read().strip().split('\n\n')
|
||||||
positions = util.detokenize(token_rules, words)
|
golds = []
|
||||||
aligned = []
|
for para in paras:
|
||||||
for group in positions:
|
if not para.strip():
|
||||||
w_group = [words[i] for i in group]
|
|
||||||
p_group = [pos[i] for i in group]
|
|
||||||
aligned.append('<SEP>'.join(w_group) + sep + '_'.join(p_group))
|
|
||||||
return ' '.join(aligned)
|
|
||||||
|
|
||||||
|
|
||||||
def read_tagged(detoken_rules, file_, sep='/'):
|
|
||||||
sentences = []
|
|
||||||
for line in file_:
|
|
||||||
if not line.strip():
|
|
||||||
continue
|
continue
|
||||||
line = realign_tagged(detoken_rules, line, sep=sep)
|
lines = para.strip().split('\n')
|
||||||
tokens, tags = _parse_line(line, sep)
|
raw = lines.pop(0)
|
||||||
assert len(tokens) == len(tags)
|
gold_toks = lines.pop(0)
|
||||||
sentences.append((tokens, tags))
|
tokens = EN.tokenize(raw)
|
||||||
return sentences
|
tags = []
|
||||||
|
conll_toks = []
|
||||||
|
for line in lines:
|
||||||
def _parse_line(line, sep):
|
pieces = line.split()
|
||||||
words = []
|
conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[3]))
|
||||||
tags = []
|
for i, token in enumerate(tokens):
|
||||||
for token_str in line.split():
|
if not conll_toks:
|
||||||
word, pos = token_str.rsplit(sep, 1)
|
tags.append('NULL')
|
||||||
word = word.replace('<SEP>', '')
|
elif token.idx == conll_toks[0][0]:
|
||||||
subtokens = EN.tokenize(word)
|
tags.append(conll_toks[0][2])
|
||||||
subtags = pos.split('_')
|
conll_toks.pop(0)
|
||||||
while len(subtags) < len(subtokens):
|
elif token.idx < conll_toks[0]:
|
||||||
subtags.append('NULL')
|
tags.append('NULL')
|
||||||
assert len(subtags) == len(subtokens), [t.string for t in subtokens]
|
else:
|
||||||
words.append(word)
|
conll_toks.pop(0)
|
||||||
tags.extend([Tagger.encode_pos(ptb_to_univ(pos)) for pos in subtags])
|
assert len(tags) == len(tokens)
|
||||||
tokens = EN.tokenize(' '.join(words)), tags
|
tags = [Tagger.encode_pos(t) for t in tags]
|
||||||
return tokens
|
golds.append((tokens, tags))
|
||||||
|
return golds
|
||||||
|
|
||||||
def get_tagdict(train_sents):
|
|
||||||
tagdict = {}
|
|
||||||
for tokens, tags in train_sents:
|
|
||||||
for i, tag in enumerate(tags):
|
|
||||||
if tag == 'NULL':
|
|
||||||
continue
|
|
||||||
word = tokens.string(i)
|
|
||||||
tagdict.setdefault(word, {}).setdefault(tag, 0)
|
|
||||||
tagdict[word][tag] += 1
|
|
||||||
return tagdict
|
|
||||||
|
|
||||||
|
|
||||||
def ptb_to_univ(tag):
|
def ptb_to_univ(tag):
|
||||||
|
|
Loading…
Reference in New Issue