update init_model.py to previous (better) state

This commit is contained in:
Wolfgang Seeker 2016-03-29 16:12:13 +02:00
parent 26622f0ffc
commit a8f4e49900
1 changed file with 6 additions and 8 deletions

View File

@@ -98,7 +98,7 @@ def _read_probs(loc):
return probs, probs['-OOV-'] return probs, probs['-OOV-']
def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
if not loc.exists(): if not loc.exists():
print("Warning: Frequencies file not found") print("Warning: Frequencies file not found")
return {}, 0.0 return {}, 0.0
@@ -109,7 +109,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
else: else:
file_ = loc.open() file_ = loc.open()
for i, line in enumerate(file_): for i, line in enumerate(file_):
freq, doc_freq, key = line.split('\t', 2) freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq) freq = int(freq)
counts.inc(i+1, freq) counts.inc(i+1, freq)
total += freq total += freq
@@ -121,15 +121,13 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
file_ = loc.open() file_ = loc.open()
probs = {} probs = {}
for line in file_: for line in file_:
freq, doc_freq, key = line.split('\t', 2) freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq) doc_freq = int(doc_freq)
freq = int(freq) freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
# word = literal_eval(key) word = literal_eval(key)
word = key
smooth_count = counts.smoother(int(freq)) smooth_count = counts.smoother(int(freq))
log_smooth_count = math.log(smooth_count) probs[word] = math.log(smooth_count) - log_total
probs[word] = log_smooth_count - log_total
oov_prob = math.log(counts.smoother(0)) - log_total oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob return probs, oov_prob
@@ -166,7 +164,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
clusters = _read_clusters(src_dir / 'clusters.txt') clusters = _read_clusters(src_dir / 'clusters.txt')
probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
if not probs: if not probs:
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
if not probs: if not probs:
oov_prob = -20 oov_prob = -20
else: else: