revert init_model.py back to pre-german state (because it makes more sense)

simplify token.n_rights and token.n_lefts
2016-03-21 16:10:25 +01:00 · 2016-03-21 16:10:25 +01:00 · 5080077097
parent 5e2e8e951a
commit 5080077097
2 changed files with 5 additions and 18 deletions
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -98,7 +98,7 @@ def _read_probs(loc):
    return probs, probs['-OOV-']


-def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
+def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0
@@ -125,8 +125,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
-#            word = literal_eval(key)
-            word = key
+            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            log_smooth_count = math.log(smooth_count)
            probs[word] = math.log(smooth_count) - log_total
@@ -166,7 +165,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
    if not probs:
-        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
+        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
    if not probs:
        oov_prob = -20
    else:
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -179,23 +179,11 @@ cdef class Token:

    property n_lefts:
        def __get__(self):
-            cdef int n = 0
-            cdef const TokenC* ptr = self.c - self.i
-            while ptr != self.c:
-                if ptr + ptr.head == self.c:
-                    n += 1
-                ptr += 1
-            return n
+            return self.c.l_kids

    property n_rights:
        def __get__(self):
-            cdef int n = 0
-            cdef const TokenC* ptr = self.c + (self.array_len - self.i)
-            while ptr != self.c:
-                if ptr + ptr.head == self.c:
-                    n += 1
-                ptr -= 1
-            return n
+            return self.c.r_kids

    property lefts:
        def __get__(self):