From 2d0e99a096eb534cde300678a131a1ecc0074f11 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 7 Jul 2015 14:23:08 +0200
Subject: [PATCH] * Pass pos_tags into Tokenizer.from_dir

---
 spacy/tokenizer.pyx | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 353dfd81c..4dd33ab15 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -31,14 +31,12 @@ cdef class Tokenizer:
         self._load_special_tokenization(rules, pos_tags)
 
     @classmethod
-    def from_dir(cls, Vocab vocab, directory):
-        data_dir = path.join(data_dir, 'tokenizer')
-        rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
+    def from_dir(cls, Vocab vocab, data_dir, pos_tags):
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
         prefix_re = re.compile(prefix_re)
         suffix_re = re.compile(suffix_re)
         infix_re = re.compile(infix_re)
-        return cls(vocab, tok_rules, prefix_re, suffix_re, infix_re,
-                   pos_tags)
+        return cls(vocab, rules, prefix_re, suffix_re, infix_re, pos_tags)
 
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
@@ -257,7 +255,6 @@ cdef class Tokenizer:
             else:
                 tokens[i].lemma = 0
             if 'pos' in props:
-                # TODO: Clean up this mess...
                 tokens[i].tag = self.vocab.strings[props['pos']]
                 tokens[i].pos = tag_map[props['pos']][0]
             # These are defaults, which can be over-ridden by the
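
For context, a minimal sketch of how the patched classmethod would be called. This is not part of the commit: the names vocab and tag_map stand in for an already-constructed Vocab and the language's POS tag map, and the directory path is a placeholder for wherever read_lang_data expects the tokenizer rules and regex files.

    from spacy.tokenizer import Tokenizer

    # Assumptions: `vocab` is an existing Vocab instance, and `tag_map`
    # maps fine-grained tag strings to entries whose first element is the
    # coarse POS (matching the tag_map[props['pos']][0] lookup in the
    # second hunk). The path below is illustrative only.
    tokenizer = Tokenizer.from_dir(vocab, 'data/en/tokenizer', tag_map)

    # Special-case rules loaded from the data directory can now carry
    # POS information through to the tokens they produce.
    tokens = tokenizer.tokens_from_list(['Hello', 'world'])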