From a8f4e4990096be4be6922761aae7c73c7f3ce80e Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Tue, 29 Mar 2016 16:12:13 +0200 Subject: [PATCH] update init_model.py to previous (better) state --- bin/init_model.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index b14015b39..3bbd7c469 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -98,7 +98,7 @@ def _read_probs(loc): return probs, probs['-OOV-'] -def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): +def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200): if not loc.exists(): print("Warning: Frequencies file not found") return {}, 0.0 @@ -109,7 +109,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): else: file_ = loc.open() for i, line in enumerate(file_): - freq, doc_freq, key = line.split('\t', 2) + freq, doc_freq, key = line.rstrip().split('\t', 2) freq = int(freq) counts.inc(i+1, freq) total += freq @@ -121,15 +121,13 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): file_ = loc.open() probs = {} for line in file_: - freq, doc_freq, key = line.split('\t', 2) + freq, doc_freq, key = line.rstrip().split('\t', 2) doc_freq = int(doc_freq) freq = int(freq) if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: -# word = literal_eval(key) - word = key + word = literal_eval(key) smooth_count = counts.smoother(int(freq)) - log_smooth_count = math.log(smooth_count) - probs[word] = log_smooth_count - log_total + probs[word] = math.log(smooth_count) - log_total oov_prob = math.log(counts.smoother(0)) - log_total return probs, oov_prob @@ -166,7 +164,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: - probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') + probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz') if not probs: oov_prob = -20 else: