* Pass pos_tags into Tokenizer.from_dir

This commit is contained in:
Matthew Honnibal 2015-07-07 14:23:08 +02:00
parent 6788c86b2f
commit 2d0e99a096
1 changed file with 3 additions and 6 deletions

View File

@@ -31,14 +31,12 @@ cdef class Tokenizer:
         self._load_special_tokenization(rules, pos_tags)
 
     @classmethod
-    def from_dir(cls, Vocab vocab, directory):
-        data_dir = path.join(data_dir, 'tokenizer')
-        rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
+    def from_dir(cls, Vocab vocab, data_dir, pos_tags):
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
         prefix_re = re.compile(prefix_re)
         suffix_re = re.compile(suffix_re)
         infix_re = re.compile(infix_re)
-        return cls(vocab, tok_rules, prefix_re, suffix_re, infix_re,
-                   pos_tags)
+        return cls(vocab, rules, prefix_re, suffix_re, infix_re, pos_tags)
 
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
@@ -257,7 +255,6 @@ cdef class Tokenizer:
             else:
                 tokens[i].lemma = 0
             if 'pos' in props:
-                # TODO: Clean up this mess...
                 tokens[i].tag = self.vocab.strings[props['pos']]
                 tokens[i].pos = tag_map[props['pos']][0]
             # These are defaults, which can be over-ridden by the