* Remove word-sense (supersense) handling from init_model

2015-07-14 10:56:17 +02:00 · 2015-07-14 10:56:17 +02:00 · af54d05d60
parent 3de1b3ef1d
commit af54d05d60
1 changed file with 0 additions and 15 deletions
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -30,8 +30,6 @@ from spacy.vocab import write_binary_vectors
 from spacy.parts_of_speech import NOUN, VERB, ADJ
 import spacy.senses
 def setup_tokenizer(lang_data_dir, tok_dir):
    if not tok_dir.exists():
@@ -103,11 +101,7 @@ def setup_vocab(src_dir, dst_dir):
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    senses = _read_senses(src_dir / 'supersenses.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    for word in set(clusters).union(set(senses)):
        if word not in probs:
            probs[word] = -17.0
    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
    lexicon = []
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
@@ -120,15 +114,6 @@ def setup_vocab(src_dir, dst_dir):
            entry['cluster'] = int(cluster[::-1], 2)
            orth_senses = set()
            lemmas = []
            for pos in [NOUN, VERB, ADJ]:
                for lemma in lemmatizer(word.lower(), pos):
                    lemmas.append(lemma)
                    orth_senses.update(senses[lemma][pos])
            if word.lower() == 'dogging':
                print word
                print lemmas
                print [spacy.senses.STRINGS[si] for si in orth_senses]
            entry['senses'] = list(sorted(orth_senses))
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))