mirror of https://github.com/explosion/spaCy.git

commit 963570aa49
Merge branch 'master' of github.com:spacy-io/spaCy
@@ -129,7 +129,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
         word = key
         smooth_count = counts.smoother(int(freq))
         log_smooth_count = math.log(smooth_count)
-        probs[word] = math.log(smooth_count) - log_total
+        probs[word] = log_smooth_count - log_total
     oov_prob = math.log(counts.smoother(0)) - log_total
     return probs, oov_prob

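The only functional change in this hunk is that the logarithm of the smoothed count is computed once and reused instead of being recomputed. A minimal, self-contained sketch of the same computation, assuming a hypothetical smoother callable in place of counts.smoother (which is defined outside the hunk):

import math

def smoothed_log_probs(freqs, smoother, total):
    # freqs: mapping of word -> raw frequency; smoother: hypothetical
    # stand-in for counts.smoother above; total: total token count.
    log_total = math.log(total)
    probs = {}
    for word, freq in freqs.items():
        smooth_count = smoother(int(freq))
        log_smooth_count = math.log(smooth_count)
        # Reuse the precomputed log, mirroring the change above.
        probs[word] = log_smooth_count - log_total
    # Unseen words get the smoothed probability of a zero count.
    oov_prob = math.log(smoother(0)) - log_total
    return probs, oov_prob

Here smoother(0) is assumed to return a small positive count for unseen words, as the oov_prob line above implies.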
@@ -54,7 +54,7 @@ def represent_word(word):
     # Only do this if the lower-cased form is more probable.
     if text.istitle() \
     and is_sent_begin(word) \
-    and word.prob < word.vocab[text.lower()].prob:
+    and word.prob < word.doc.vocab[text.lower()].prob:
         text = text.lower()
     return text + '|' + word.tag_

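For context, the changed condition is part of a lowercasing heuristic: a title-cased, sentence-initial token is lowercased only when the lower-cased form is more probable in the vocabulary, which the fix now reaches via word.doc.vocab. A self-contained toy version of the same heuristic, with hypothetical names (normalize_token, log_prob) and the vocabulary lookup and sentence-begin test passed in explicitly:

def normalize_token(text, tag, is_sentence_initial, log_prob):
    # Lowercase a sentence-initial title-cased token only when the
    # lower-cased form has the higher log-probability.
    if text.istitle() and is_sentence_initial \
            and log_prob(text) < log_prob(text.lower()):
        text = text.lower()
    return text + '|' + tag

# Toy usage with made-up log-probabilities.
log_probs = {'the': -3.0, 'The': -6.5}
print(normalize_token('The', 'DT', True, lambda w: log_probs.get(w, -20.0)))
# -> the|DT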
@@ -25,7 +25,7 @@ cdef class Tokenizer:

     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
-    cdef unicode _split_affixes(self, unicode string, vector[LexemeC*] *prefixes,
+    cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
                                 vector[LexemeC*] *suffixes)
     cdef int _attach_tokens(self, Doc tokens, unicode string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
@@ -155,11 +155,11 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
         orig_size = tokens.length
-        span = self._split_affixes(span, &prefixes, &suffixes)
+        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
         self._attach_tokens(tokens, span, &prefixes, &suffixes)
         self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)

-    cdef unicode _split_affixes(self, unicode string, vector[const LexemeC*] *prefixes,
+    cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes,
                                 vector[const LexemeC*] *suffixes):
         cdef size_t i
         cdef unicode prefix
@@ -176,7 +176,7 @@ cdef class Tokenizer:
                 # Check whether we've hit a special-case
                 if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
-                    prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
+                    prefixes.push_back(self.vocab.get(mem, prefix))
                     break
             suf_len = self.find_suffix(string)
             if suf_len != 0:
@@ -185,18 +185,18 @@ cdef class Tokenizer:
                 # Check whether we've hit a special-case
                 if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
                     string = minus_suf
-                    suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
+                    suffixes.push_back(self.vocab.get(mem, suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
-                prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
-                suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
+                prefixes.push_back(self.vocab.get(mem, prefix))
+                suffixes.push_back(self.vocab.get(mem, suffix))
             elif pre_len:
                 string = minus_pre
-                prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
+                prefixes.push_back(self.vocab.get(mem, prefix))
             elif suf_len:
                 string = minus_suf
-                suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
+                suffixes.push_back(self.vocab.get(mem, suffix))
             if string and (self._specials.get(hash_string(string)) != NULL):
                 break
         return string
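Taken together, the tokenizer hunks thread a Pool mem argument from _tokenize, which passes tokens.mem (the Doc's memory pool), down into _split_affixes, so the vocab.get lookups for affix substrings allocate against the pool that was passed in rather than against self.vocab.mem, presumably so that lexemes created only for these transient substrings are tied to the document's allocations instead of accumulating in the shared vocabulary pool. The sketch below is a pure-Python analogy of that ownership idea; the Pool class here is a hypothetical stand-in, not cymem's actual API:

class Pool:
    # Hypothetical pool: objects allocated through it live exactly as
    # long as the pool keeps them.
    def __init__(self):
        self._owned = []

    def allocate(self, factory):
        obj = factory()
        self._owned.append(obj)   # the pool holds the owning reference
        return obj

    def release_all(self):
        self._owned.clear()       # everything allocated here is dropped together

vocab_pool = Pool()   # analogous to self.vocab.mem
doc_pool = Pool()     # analogous to tokens.mem in the hunks above

# New behaviour: affix lexemes are allocated from the document's pool,
# so they go away when that pool is released.
affix_lexeme = doc_pool.allocate(dict)
doc_pool.release_all()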