* Add cluster words to probs in init_model

This commit is contained in:
Matthew Honnibal 2015-07-23 09:27:07 +02:00
parent bee2e77983
commit da4821fc14
1 changed file with 8 additions and 0 deletions

View File

@@ -115,6 +115,14 @@ def setup_vocab(src_dir, dst_dir):
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props) vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
clusters = _read_clusters(src_dir / 'clusters.txt') clusters = _read_clusters(src_dir / 'clusters.txt')
probs = _read_probs(src_dir / 'words.sgt.prob') probs = _read_probs(src_dir / 'words.sgt.prob')
if not probs:
min_prob = 0.0
else:
min_prob = min(probs.values())
for word in clusters:
if word not in probs:
probs[word] = min_prob
lexicon = [] lexicon = []
for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])): for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
entry = get_lex_props(word) entry = get_lex_props(word)