From fcd9490d56a796e003b80810999fcd0645e85707 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 2 Nov 2014 14:21:43 +1100 Subject: [PATCH] * Add pos_tag method to Language --- spacy/lang.pxd | 4 ++++ spacy/lang.pyx | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 5287132c9..356953177 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -6,6 +6,7 @@ from cymem.cymem cimport Pool from .typedefs cimport hash_t from .tokens cimport Tokens from .lexeme cimport Lexeme +from .pos cimport Tagger as PosTagger from .utf8string cimport StringStore @@ -40,11 +41,14 @@ cdef class Language: cdef PreshMap _specials cpdef readonly Lexicon lexicon + cpdef readonly PosTagger pos_tagger + cdef object _prefix_re cdef object _suffix_re cdef object _infix_re cpdef Tokens tokenize(self, unicode text) + cpdef Tokens pos_tag(self, Tokens t) cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1 cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 9c5b5100f..1058d9acd 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -23,6 +23,8 @@ from . import util from .util import read_lang_data from .tokens import Tokens +from .pos cimport Tagger as PosTagger + cdef class Language: def __init__(self, name): @@ -39,6 +41,10 @@ cdef class Language: self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) self._load_special_tokenization(rules) + if path.exists(path.join(util.DATA_DIR, name, 'pos')): + self.pos_tagger = PosTagger(path.join(util.DATA_DIR, name, 'pos')) + else: + self.pos_tagger = None cpdef Tokens tokenize(self, unicode string): """Tokenize a string. 
@@ -87,6 +93,24 @@ cdef class Language:
             self._tokenize(tokens, &span, start, i)
         return tokens
 
+    cpdef Tokens pos_tag(self, Tokens t):
+        """Assign a part-of-speech tag to every token in t, in place.
+
+        Returns t itself. If no tagger was loaded (self.pos_tagger is
+        None), t is returned unchanged.
+        """
+        cdef int idx
+        if self.pos_tagger is not None:
+            # Seed the two slots before the first token with the 'EOL' tag so
+            # the predictor has a history (idx-1, idx-2) for tokens 0 and 1.
+            # NOTE(review): assumes t.pos is addressable at -1 and -2 (e.g. a
+            # padded C buffer) -- confirm against the Tokens declaration.
+            t.pos[-1] = self.pos_tagger.encode_pos('EOL')
+            t.pos[-2] = self.pos_tagger.encode_pos('EOL')
+            for idx in range(t.length):
+                t.pos[idx] = self.pos_tagger.predict(idx, t, t.pos[idx - 1], t.pos[idx - 2])
+        return t
+
     cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes