mirror of https://github.com/explosion/spaCy.git
* Pass pos_tags into Tokenizer.from_dir
parent 6788c86b2f
commit 2d0e99a096
@@ -31,14 +31,12 @@ cdef class Tokenizer:
         self._load_special_tokenization(rules, pos_tags)
 
     @classmethod
-    def from_dir(cls, Vocab vocab, directory):
-        data_dir = path.join(data_dir, 'tokenizer')
-        rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
+    def from_dir(cls, Vocab vocab, data_dir, pos_tags):
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
         prefix_re = re.compile(prefix_re)
         suffix_re = re.compile(suffix_re)
         infix_re = re.compile(infix_re)
-        return cls(vocab, tok_rules, prefix_re, suffix_re, infix_re,
-                   pos_tags)
+        return cls(vocab, rules, prefix_re, suffix_re, infix_re, pos_tags)
 
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
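The hunk above fixes the broken name handling (directory vs. data_dir, tok_rules
vs. rules) and threads pos_tags from the caller through to the constructor; the
caller now also points data_dir directly at the tokenizer data rather than having
the method join the 'tokenizer' subdirectory itself. A minimal call-site sketch,
illustration only; the wrapper and its arguments are assumptions, not part of
this commit:

    # Hypothetical call site showing the signature change (not from this diff).
    from spacy.vocab import Vocab
    from spacy.tokenizer import Tokenizer

    def load_tokenizer(vocab, data_dir, pos_tags):
        # before this commit: Tokenizer.from_dir(vocab, directory)
        # after: the tag map must be supplied and is passed through to cls(...)
        return Tokenizer.from_dir(vocab, data_dir, pos_tags)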
@@ -257,7 +255,6 @@ cdef class Tokenizer:
                 else:
                     tokens[i].lemma = 0
                 if 'pos' in props:
-                    # TODO: Clean up this mess...
                     tokens[i].tag = self.vocab.strings[props['pos']]
                     tokens[i].pos = tag_map[props['pos']][0]
                     # These are defaults, which can be over-ridden by the
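The second hunk only drops a stale TODO; the surrounding special-case code maps
a token's 'pos' property to both a fine-grained tag (interned as a string ID)
and a coarse part of speech looked up in the tag map. A rough pure-Python
sketch, with toy stand-ins for self.vocab.strings and the language's tag map:

    # Toy stand-ins (assumptions, not spaCy's real data structures).
    strings = {'NNP': 452}            # tag name -> interned string ID
    tag_map = {'NNP': ('PROPN',)}     # fine tag -> (coarse POS, ...)

    props = {'pos': 'NNP'}
    if 'pos' in props:
        tag = strings[props['pos']]       # fine-grained tag, as in tokens[i].tag
        pos = tag_map[props['pos']][0]    # coarse POS, as in tokens[i].pos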