From 792802b2b99f19778491f7a008ceece03cccc5ac Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 12 Dec 2014 14:33:51 +1100
Subject: [PATCH] * POS tag memoisation working, with good speed-up

---
Review note: load_pos_cache() was revised after the original commit (docstring,
max_entries parameter, malformed-line handling). Hunk counts and the diffstat
are recomputed; the index hashes still name the original blobs.

 spacy/en.pyx   | 49 ++++++++++++++++++++++++++++++++++++++++++++++---
 spacy/lang.pxd |  1 +
 spacy/lang.pyx |  2 ++
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index 3ed0eaaa9..80bea551f 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -32,8 +32,10 @@ provides a fully Penn Treebank 3-compliant tokenizer.
 '''
 from __future__ import unicode_literals
 
+from murmurhash.mrmr cimport hash64
+
 cimport lang
-from .typedefs cimport flags_t
+from .typedefs cimport hash_t, id_t, flags_t
 import orth
 from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from .morphology cimport X, PUNCT, EOL
@@ -41,6 +43,9 @@ from .morphology cimport X, PUNCT, EOL
 from .tokens cimport Morphology
 
 
+DEF USE_POS_CACHE = True
+
+
 POS_TAGS = {
     'NULL': (NO_TAG, {}),
     'EOL': (EOL, {}),
@@ -134,6 +139,34 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
+    def load_pos_cache(self, loc, int max_entries=500000):
+        """Populate the POS cache from a whitespace-delimited text file.
+
+        Each usable line must have at least three fields. Field 1 is parsed
+        as the integer cache key and field 2 as the integer tag stored under
+        that key; field 0 is not read.
+
+        Args:
+            loc: Path of the cache file, passed to open().
+            max_entries (int): Stop once this many entries have been stored.
+        """
+        cdef int n_loaded = 0
+        cdef hash_t key
+        cdef int pos
+        with open(loc) as file_:
+            for line in file_:
+                if n_loaded >= max_entries:
+                    break
+                pieces = line.split()
+                # Blank or truncated lines carry no key/tag pair; skip them
+                # rather than fail the whole load with an IndexError.
+                if len(pieces) < 3:
+                    continue
+                key = int(pieces[1])
+                pos = int(pieces[2])
+                self._pos_cache.set(key, pos)
+                n_loaded += 1
+
     def get_props(self, unicode string):
         return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}
 
@@ -156,11 +189,21 @@ cdef class English(Language):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
         cdef TokenC* t = tokens.data
+        cdef id_t[2] bigram
+        cdef hash_t cache_key
+        cdef void* cached = NULL
         assert self.morphologizer is not None
         cdef dict tagdict = self.pos_tagger.tagdict
         for i in range(tokens.length):
-            if t[i].lex.sic in tagdict:
-                t[i].pos = tagdict[t[i].lex.sic]
+            if USE_POS_CACHE:
+                bigram[0] = tokens.data[i].lex.sic
+                # NOTE(review): at i == 0 this reads tokens.data[-1]; presumably
+                # the Tokens buffer is padded at the front -- confirm.
+                bigram[1] = tokens.data[i-1].lex.sic
+                cache_key = hash64(bigram, sizeof(id_t) * 2, 0)
+                cached = self._pos_cache.get(cache_key)
+            if cached != NULL:
+                t[i].pos = cached
             else:
                 fill_pos_context(context, i, t)
                 t[i].pos = self.pos_tagger.predict(context)
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 20986f134..47b70d9ca 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -44,6 +44,7 @@ cdef class Language:
     cpdef readonly Tagger pos_tagger
     cpdef readonly Morphologizer morphologizer
 
+    cdef PreshMap _pos_cache
     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 4617c3853..4a2ae2da0 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -34,6 +34,7 @@ cdef class Language:
         self.mem = Pool()
         self._cache = PreshMap(2 ** 25)
         self._specials = PreshMap(2 ** 16)
+        self._pos_cache = PreshMap(2 ** 16)
         rules, prefix, suffix, infix = util.read_lang_data(name)
         self._prefix_re = re.compile(prefix)
         self._suffix_re = re.compile(suffix)
@@ -50,6 +51,7 @@ cdef class Language:
         self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
         self.morphologizer = Morphologizer(self.lexicon.strings,
                                            path.join(util.DATA_DIR, self.name))
+        self.load_pos_cache(path.join(util.DATA_DIR, self.name, 'pos', 'bigram_cache_2m'))
 
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])