* Adapt POS tagger decision-memory for use in parser

This commit is contained in:
Matthew Honnibal 2014-12-19 07:23:04 +11:00
parent 809ddf7887
commit 033d6c9ac2
1 changed file with 46 additions and 64 deletions

View File

@ -10,6 +10,7 @@ from .tokens cimport TokenC
from .typedefs cimport hash_t from .typedefs cimport hash_t
from preshed.maps cimport MapStruct, Cell, map_get, map_set, map_init from preshed.maps cimport MapStruct, Cell, map_get, map_set, map_init
from murmurhash.mrmr cimport hash64
cdef class Index: cdef class Index:
@ -34,94 +35,75 @@ cdef class Index:
self.counts.push_back(doc_counts) self.counts.push_back(doc_counts)
cdef class DecisionMemory:
    # Memory of (context, class) decision frequencies — the POS tagger's
    # n-gram memory, adapted here for generic parser decisions.
    def __init__(self, class_names):
        """Initialize the memory.

        class_names: sequence of class-name strings; their index order
            defines the integer class IDs used throughout this class.
        """
        self.class_names = class_names
        self.n_classes = len(class_names)
        self.mem = Pool()
        # Marginal frequency of each context key on its own.
        self._counts = PreshCounter()
        # Joint frequency of each (context, class) pair.
        self._class_counts = PreshCounter()
def __getitem__(self, ids): def __getitem__(self, ids):
cdef id_t[2] ngram cdef id_t[2] context
ngram[0] = ids[0] context[0] = context[0]
ngram[1] = ids[1] context[1] = context[1]
cdef hash_t ngram_key = hash64(ngram, 2 * sizeof(id_t), 0) cdef hash_t context_key = hash64(context, 2 * sizeof(id_t), 0)
cdef hash_t[2] pos_context cdef hash_t[2] class_context
pos_context[0] = ngram_key class_context[0] = context_key
counts = {} counts = {}
cdef id_t i cdef id_t i
for i, tag in enumerate(self.tag_names): for i, clas in enumerate(self.clas_names):
pos_context[1] = <hash_t>i class_context[1] = <hash_t>i
key = hash64(pos_context, sizeof(hash_t) * 2, 0) key = hash64(class_context, sizeof(hash_t) * 2, 0)
count = self._pos_counts[key] count = self._class_counts[key]
counts[tag] = count counts[clas] = count
return counts return counts
    @cython.cdivision(True)
    def iter_contexts(self, float min_acc=0.99, count_t min_freq=10):
        """Yield (count, context_key, best_class) for every context whose
        most frequent class is reliable enough to memorize.

        min_acc: minimum fraction of the context's decisions that must have
            chosen the best class.
        min_freq: minimum number of times the context must have been seen.
        """
        # Scratch array for per-class counts, sized for all classes.
        cdef Address counts_addr = Address(self.n_classes, sizeof(count_t))
        cdef count_t* counts = <count_t*>counts_addr.ptr
        cdef MapStruct* context_counts = self._counts.c_map
        cdef hash_t context_key
        cdef count_t context_freq
        cdef int best_class
        cdef float acc
        cdef int i
        # Walk the raw hash-table cells; a key of 0 marks an empty cell.
        for i in range(context_counts.length):
            context_key = context_counts.cells[i].key
            context_freq = <count_t>context_counts.cells[i].value
            if context_key != 0 and context_freq >= min_freq:
                best_class = self.find_best_class(counts, context_key)
                # NOTE(review): with cdivision(True) and both operands of
                # integer type count_t, this division may truncate before
                # the assignment to float — confirm acc is not always 0/1.
                acc = counts[best_class] / context_freq
                if acc >= min_acc:
                    yield counts[best_class], context_key, best_class
    cdef int inc(self, hash_t context_key, hash_t clas, count_t inc) except -1:
        """Record `inc` observations of class `clas` being chosen in the
        context identified by `context_key`.

        Updates both the marginal context counter and the joint
        (context, class) counter so accuracy can later be computed.
        """
        cdef hash_t context_and_class_key
        cdef hash_t[2] context_and_class
        context_and_class[0] = context_key
        context_and_class[1] = clas
        context_and_class_key = hash64(context_and_class, 2 * sizeof(hash_t), 0)
        self._counts.inc(context_key, inc)
        self._class_counts.inc(context_and_class_key, inc)
cdef int inc(self, TokenC* word, count_t inc) except -1: cdef int find_best_class(self, count_t* counts, hash_t context_key) except -1:
cdef hash_t[2] ngram_pos_context
cdef hash_t ngram_key = self._ngram_key(word)
ngram_pos_context[0] = ngram_key
ngram_pos_context[1] = <hash_t>word.pos
ngram_pos_key = hash64(ngram_pos_context, 2 * sizeof(hash_t), 0)
self._counts.inc(ngram_key, inc)
self._pos_counts.inc(ngram_pos_key, inc)
cdef int find_best_pos(self, count_t* counts, hash_t ngram_key) except -1:
cdef hash_t[2] unhashed_key cdef hash_t[2] unhashed_key
unhashed_key[0] = ngram_key unhashed_key[0] = context_key
cdef count_t total = 0 cdef count_t total = 0
cdef hash_t key cdef hash_t key
cdef int pos cdef int clas
cdef int best cdef int best
cdef int mode = 0 cdef int mode = 0
for pos in range(self.nr_tags): for clas in range(self.n_classes):
unhashed_key[1] = <hash_t>pos unhashed_key[1] = <hash_t>clas
key = hash64(unhashed_key, sizeof(hash_t) * 2, 0) key = hash64(unhashed_key, sizeof(hash_t) * 2, 0)
count = self._pos_counts[key] count = self._class_counts[key]
counts[pos] = count counts[clas] = count
if count >= mode: if count >= mode:
mode = count mode = count
best = pos best = clas
total += count total += count
return best return best
    # NOTE(review): the two methods below appear only on the left
    # (pre-change) column of the diff — this commit DELETED them, replacing
    # the token-specific bigram API with the generic context-key API above.
    cdef count_t ngram_count(self, TokenC* word) except -1:
        """Return how often this word's (word, previous-word) bigram was
        observed."""
        cdef hash_t ngram_key = self._ngram_key(word)
        return self._counts[ngram_key]

    cdef hash_t _ngram_key(self, TokenC* word) except 0:
        """Hash the (current word, previous word) lexeme-id pair into a
        single 64-bit key."""
        cdef id_t[2] context
        context[0] = word.lex.sic
        # word[-1] reads the preceding token in the contiguous token array.
        context[1] = word[-1].lex.sic
        return hash64(context, sizeof(id_t) * 2, 0)