"""Utilities for reading POS-tagged training data and aligning it with the
tokenizer's own segmentation.

Tagged input lines look like ``word/TAG word/TAG ...`` (separator
configurable).  Because the corpus tokenization may differ from EN's, tokens
are first re-grouped with ``util.detokenize`` and the grouped tags joined
with ``_`` so each re-tokenized piece can be matched back to a tag.
"""
from __future__ import unicode_literals
from . import util
from . import tokens
from .en import EN

from .pos import Tagger


def realign_tagged(token_rules, tagged_line, sep='/'):
    """Re-group the tokens of one tagged line according to *token_rules*.

    Each whitespace-separated token is split on the last *sep* into
    (word, tag).  Groups produced by ``util.detokenize`` are concatenated:
    their words joined directly, their tags joined with ``'_'``.

    Returns the realigned line as a single string.
    """
    words, pos = zip(*[tok.rsplit(sep, 1) for tok in tagged_line.split()])
    positions = util.detokenize(token_rules, words)
    aligned = []
    for group in positions:
        grouped_words = [words[i] for i in group]
        grouped_tags = [pos[i] for i in group]
        aligned.append(''.join(grouped_words) + sep + '_'.join(grouped_tags))
    return ' '.join(aligned)


def read_tagged(detoken_rules, file_, sep='/'):
    """Read tagged sentences from *file_* (one sentence per line).

    Each line is realigned with *detoken_rules*, then parsed into a
    (tokens, tags) pair; the pair lengths are asserted equal.

    Returns a list of (tokens, tags) tuples.
    """
    sentences = []
    for line in file_:
        line = realign_tagged(detoken_rules, line, sep=sep)
        # Renamed from `tokens` to avoid shadowing the module import above.
        sent_tokens, sent_tags = _parse_line(line, sep)
        assert len(sent_tokens) == len(sent_tags)
        sentences.append((sent_tokens, sent_tags))
    return sentences


def _parse_line(line, sep):
    """Parse one realigned line into (EN tokens, encoded tags).

    Each token string is split on the last *sep* into (word, tag-group);
    the tag group is split on ``'_'`` and padded with ``'NULL'`` so every
    sub-token produced by ``EN.tokenize`` receives a tag.

    NOTE(review): the original source contained ``word.replace('', '')``
    here — a no-op (the literal to strip was presumably lost); it has been
    removed without behavior change.  Confirm against the original corpus
    format whether a separator marker (e.g. '<SEP>') should be stripped.
    """
    words = []
    tags = []
    for token_str in line.split():
        word, pos = token_str.rsplit(sep, 1)
        subtokens = EN.tokenize(word)
        subtags = pos.split('_')
        # Pad with NULL so every sub-token gets a tag slot.
        while len(subtags) < len(subtokens):
            subtags.append('NULL')
        assert len(subtags) == len(subtokens), [t.string for t in subtokens]
        words.append(word)
        # `subtag` (not `pos`) avoids shadowing the outer loop variable.
        tags.extend([Tagger.encode_pos(subtag) for subtag in subtags])
    # Assumes re-tokenizing the joined words reproduces the same sub-token
    # sequence counted above — TODO confirm for all tokenizer rules.
    return EN.tokenize(' '.join(words)), tags


def get_tagdict(train_sents):
    """Count tag frequencies per word over *train_sents*.

    *train_sents* is an iterable of (tokens, tags) pairs as produced by
    ``read_tagged``.  'NULL' padding tags are skipped.

    Returns a dict mapping word string -> {tag: count}.
    """
    tagdict = {}
    for sent_tokens, sent_tags in train_sents:
        for i, tag in enumerate(sent_tags):
            if tag == 'NULL':
                continue
            word = sent_tokens.string(i)
            word_counts = tagdict.setdefault(word, {})
            word_counts[tag] = word_counts.get(tag, 0) + 1
    # Original source had the return statement split across a line break
    # (`return` / `tagdict`), which would return None; fixed here.
    return tagdict