diff --git a/bin/init_model.py b/bin/init_model.py index 5e62a7faf..4d7611cce 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -98,7 +98,7 @@ def _read_probs(loc): return probs, probs['-OOV-'] -def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): +def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200): if not loc.exists(): print("Warning: Frequencies file not found") return {}, 0.0 @@ -125,8 +125,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): doc_freq = int(doc_freq) freq = int(freq) if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: -# word = literal_eval(key) - word = key + word = literal_eval(key) smooth_count = counts.smoother(int(freq)) log_smooth_count = math.log(smooth_count) probs[word] = math.log(smooth_count) - log_total @@ -166,7 +165,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: - probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') + probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz') if not probs: oov_prob = -20 else: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 17d756b3e..68ce2ffb5 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -179,23 +179,11 @@ cdef class Token: property n_lefts: def __get__(self): - cdef int n = 0 - cdef const TokenC* ptr = self.c - self.i - while ptr != self.c: - if ptr + ptr.head == self.c: - n += 1 - ptr += 1 - return n + return self.c.l_kids property n_rights: def __get__(self): - cdef int n = 0 - cdef const TokenC* ptr = self.c + (self.array_len - self.i) - while ptr != self.c: - if ptr + ptr.head == self.c: - n += 1 - ptr -= 1 - return n + return self.c.r_kids property lefts: def __get__(self):