From 141639ea3a8fc65f006d16594ab207b3a1c8c7d5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 21 Feb 2016 23:17:47 +0000
Subject: [PATCH 1/3] * Fix bug in tokenizer that caused new tokens to be
 added for affixes

---
 spacy/tokenizer.pxd |  2 +-
 spacy/tokenizer.pyx | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index c07e87bbc..2fc192d12 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -25,7 +25,7 @@ cdef class Tokenizer:
 
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
-    cdef unicode _split_affixes(self, unicode string, vector[LexemeC*] *prefixes,
+    cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
                                 vector[LexemeC*] *suffixes)
     cdef int _attach_tokens(self, Doc tokens, unicode string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index ad3a500a3..f8613fce8 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -155,11 +155,11 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
        orig_size = tokens.length
-        span = self._split_affixes(span, &prefixes, &suffixes)
+        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
         self._attach_tokens(tokens, span, &prefixes, &suffixes)
         self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
 
-    cdef unicode _split_affixes(self, unicode string, vector[const LexemeC*] *prefixes,
+    cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes,
                                 vector[const LexemeC*] *suffixes):
         cdef size_t i
         cdef unicode prefix
@@ -176,7 +176,7 @@
                 # Check whether we've hit a special-case
                 if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
-                    prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
+                    prefixes.push_back(self.vocab.get(mem, prefix))
                     break
             suf_len = self.find_suffix(string)
             if suf_len != 0:
@@ -185,18 +185,18 @@
                 # Check whether we've hit a special-case
                 if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
                     string = minus_suf
-                    suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
+                    suffixes.push_back(self.vocab.get(mem, suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
-                prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
-                suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
+                prefixes.push_back(self.vocab.get(mem, prefix))
+                suffixes.push_back(self.vocab.get(mem, suffix))
             elif pre_len:
                 string = minus_pre
-                prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
+                prefixes.push_back(self.vocab.get(mem, prefix))
             elif suf_len:
                 string = minus_suf
-                suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
+                suffixes.push_back(self.vocab.get(mem, suffix))
             if string and (self._specials.get(hash_string(string)) != NULL):
                 break
         return string

From eaccbcda0fc0705364cfc17277fe4ca3579fc2c2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 16 Mar 2016 06:04:14 +1100
Subject: [PATCH 2/3] Fix bug in pos_tag.py script

---
 examples/pos_tag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pos_tag.py b/examples/pos_tag.py
index a118966b0..c61d29636 100644
--- a/examples/pos_tag.py
+++ b/examples/pos_tag.py
@@ -54,7 +54,7 @@ def represent_word(word):
     # Only do this if the lower-cased form is more probable.
     if text.istitle() \
     and is_sent_begin(word) \
-    and word.prob < word.vocab[text.lower()].prob:
+    and word.prob < word.doc.vocab[text.lower()].prob:
         text = text.lower()
     return text + '|' + word.tag_
 

From 3c210f45fadd1134f1efa3eaabf9e48620212040 Mon Sep 17 00:00:00 2001
From: Yaser Martinez Palenzuela
Date: Thu, 17 Mar 2016 12:19:52 +0100
Subject: [PATCH 3/3] make use of log_smooth_count

---
 bin/init_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/init_model.py b/bin/init_model.py
index 19cfcdc25..b14015b39 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -129,7 +129,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
             word = key
             smooth_count = counts.smoother(int(freq))
             log_smooth_count = math.log(smooth_count)
-            probs[word] = math.log(smooth_count) - log_total
+            probs[word] = log_smooth_count - log_total
     oov_prob = math.log(counts.smoother(0)) - log_total
     return probs, oov_prob
 
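
Note on [PATCH 3/3]: the change reuses the already-computed log_smooth_count
rather than calling math.log(smooth_count) a second time; the probabilities
themselves are unchanged. For reference, a minimal standalone sketch of the
computation _read_freqs performs, with a stand-in add-one smoother (the
smoother here is hypothetical; the real script gets its smoother from the
counts object):

    import math

    def read_freqs_sketch(freqs, min_freq=200):
        # Stand-in for counts.smoother: add-one smoothing (hypothetical).
        smoother = lambda c: c + 1.0
        log_total = math.log(sum(freqs.values()))
        probs = {}
        for word, freq in freqs.items():
            if freq >= min_freq:
                log_smooth_count = math.log(smoother(freq))
                # Reuse log_smooth_count, as in the patch above.
                probs[word] = log_smooth_count - log_total
        # Log-probability floor assigned to unseen (out-of-vocabulary) words.
        oov_prob = math.log(smoother(0)) - log_total
        return probs, oov_prob

For example, read_freqs_sketch({'the': 1000, 'cat': 300}) returns smoothed
log-probabilities for both words plus an OOV floor below either of them.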