diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index f24ed7425..2fc1dba9e 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -21,7 +21,8 @@ from .tokens import Tokens
 
 
 cdef class Tokenizer:
-    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re):
+    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re,
+                 pos_tags, tag_names):
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()
@@ -29,10 +30,10 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules)
+        self._load_special_tokenization(rules, pos_tags, tag_names)
 
     @classmethod
-    def from_dir(cls, Vocab vocab, object data_dir):
+    def from_dir(cls, Vocab vocab, object data_dir, object pos_tags, object tag_names):
         if not path.exists(data_dir):
             raise IOError("Directory %s not found -- cannot load Tokenizer." % data_dir)
         if not path.isdir(data_dir):
@@ -41,7 +42,7 @@ cdef class Tokenizer:
         assert path.exists(data_dir) and path.isdir(data_dir)
         rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
         return cls(vocab, rules, re.compile(prefix_re), re.compile(suffix_re),
-                   re.compile(infix_re))
+                   re.compile(infix_re), pos_tags, tag_names)
 
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
@@ -234,7 +235,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, object rules):
+    def _load_special_tokenization(self, object rules, object tag_map, object tag_names):
         '''Add a special-case tokenization rule.
         '''
         cdef int i
@@ -255,6 +256,13 @@ cdef class Tokenizer:
                 tokens[i].lex = self.vocab.get(self.vocab.mem, &string)
                 if lemma:
                     tokens[i].lemma = self.vocab.strings[lemma]
+                if 'pos' in props:
+                    # TODO: Clean up this mess...
+                    tokens[i].fine_pos = tag_names.index(props['pos'])
+                    tokens[i].pos = tag_map[props['pos']][0]
+                    # These are defaults, which can be over-ridden by the
+                    # token-specific props.
+                    set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
                 set_morph_from_dict(&tokens[i].morph, props)
             cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
             cached.length = len(substrings)
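
For context, here is a minimal sketch of how the new `pos_tags`/`tag_names` arguments might be populated and passed in, inferred only from this hunk: `tag_map[tag]` is indexed as `(coarse_pos, default_morphology_dict)`, `tag_names.index(tag)` supplies the fine tag id, and each special-case `props` dict may carry a `'pos'` key. The concrete tag names, morphology keys, rule-entry keys, and numeric ids below are illustrative assumptions, not values from spaCy's language data.

```python
# Illustrative sketch only -- every name and value here is an assumption,
# not taken from spaCy's data files.

VERB, ADV = 14, 17   # stand-in coarse POS ids; real ids come from the tag scheme

# Fine-grained tag strings; a tag's list position becomes tokens[i].fine_pos.
tag_names = ['NN', 'VBP', 'RB']

# Fine tag -> (coarse POS id, default morphological features).
# The defaults are applied first, then overridden by the token-specific props.
tag_map = {
    'VBP': (VERB, {'tense': 'present'}),
    'RB':  (ADV,  {}),
}

# Special-case tokenization rules: surface form -> one props dict per sub-token.
# The hunk only shows that a props dict may carry a lemma and a 'pos' key;
# the 'F' (form) and 'L' (lemma) keys below are assumed.
rules = {
    "don't": [
        {'F': 'do',  'pos': 'VBP'},
        {'F': "n't", 'L': 'not', 'pos': 'RB'},
    ],
}

# `vocab` and `data_dir` are assumed to exist; this just shows the new signature.
tokenizer = Tokenizer.from_dir(vocab, data_dir, tag_map, tag_names)
```

With this in place, special-case tokens get a fine and coarse POS plus default morphology at rule-load time, rather than leaving those fields unset until tagging.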