# spaCy/bin/init_model.py

"""Set up a model directory.

Requires:

    lang_data --- Rules for the tokenizer
        * prefix.txt
        * suffix.txt
        * infix.txt
        * morphs.json
        * specials.json

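    corpora --- Data files
        * wordnet/dict --- WordNet dictionary files, copied to <model_dir>/wordnet
        * words.sgt.prob --- Smoothed unigram probabilities
        * freqs.txt --- Word frequencies, used when words.sgt.prob is missing or empty
        * clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
        * vectors.tgz --- Output of something like word2vec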
"""
from __future__ import unicode_literals

from ast import literal_eval
import math

import plac
from pathlib import Path

from shutil import copyfile
from shutil import copytree
import codecs
from collections import defaultdict

from spacy.en import get_lex_props
from spacy.en.lemmatizer import Lemmatizer
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.strings import hash_string
from preshed.counter import PreshCounter

from spacy.parts_of_speech import NOUN, VERB, ADJ


def setup_tokenizer(lang_data_dir, tok_dir):
    """Copy the tokenizer rule files from lang_data_dir into tok_dir.

    tok_dir is created if it does not exist yet.  A rule file that is already
    present in tok_dir is left as it is, so re-running the script does not
    overwrite it.
    """
    rule_files = ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',
                  'suffix.txt')
    if not tok_dir.exists():
        tok_dir.mkdir()

    for name in rule_files:
        target = tok_dir / name
        if target.exists():
            continue
        copyfile(str(lang_data_dir / name), str(target))


def _read_clusters(loc):
    if not loc.exists():
        print("Warning: Clusters file not found")
        return {}
    clusters = {}
    for line in codecs.open(str(loc), 'r', 'utf8'):
        try:
            cluster, word, freq = line.split()
        except ValueError:
            continue
        # If the clusterer has only seen the word a few times, its cluster is
        # unreliable.
        if int(freq) >= 3:
            clusters[word] = cluster
        else:
            clusters[word] = '0'
    # Expand clusters with re-casing
    for word, cluster in list(clusters.items()):
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters


def _read_probs(loc):
    if not loc.exists():
        print("Warning: Probabilities file not found")
        return {}
    probs = {}
    for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
        prob, word = line.split()
        prob = float(prob)
        probs[word] = prob
    return probs


def _read_freqs(loc):
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return None
    counts = PreshCounter()
    total = 0
    for i, line in enumerate(loc.open()):
        freq, doc_freq, key = line.split('\t', 2)
        freq = int(freq)
        counts.inc(i+1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    for line in loc.open():
        freq, doc_freq, key = line.split('\t', 2)
        if int(doc_freq) >= 2 and int(freq) >= 5 and len(key) < 200:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            log_smooth_count = math.log(smooth_count)
            probs[word] = math.log(smooth_count) - log_total
    probs['-OOV-'] = math.log(counts.smoother(0)) - log_total
    return probs


def _read_senses(loc):
    lexicon = defaultdict(lambda: defaultdict(list))
    if not loc.exists():
        print("Warning: WordNet senses not found")
        return lexicon
    sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
    pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
    for line in codecs.open(str(loc), 'r', 'utf8'):
        sense_strings = line.split()
        word = sense_strings.pop(0)
        for sense in sense_strings:
            pos, sense = sense[3:].split('.')
            sense_name = '%s_%s' % (pos[0].upper(), sense.lower())
            if sense_name != 'N_tops':
                sense_id = sense_names[sense_name]
                lexicon[word][pos_ids[pos]].append(sense_id)
    return lexicon


def setup_vocab(src_dir, dst_dir):
    """Build the vocabulary files in dst_dir from the corpora in src_dir.

    Reads these optional resources from src_dir:
        * vectors.tgz --- converted to dst_dir/vec.bin if present
        * clusters.txt --- word clusters
        * words.sgt.prob --- word probabilities
        * freqs.txt --- used for probabilities when words.sgt.prob yields none

    Writes dst_dir/lexemes.bin and dst_dir/strings.txt.  dst_dir is created
    if it does not exist yet.
    """
    if not dst_dir.exists():
        dst_dir.mkdir()

    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    else:
        print("Warning: Word vectors file not found")
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    if not probs:
        # _read_freqs returns None when freqs.txt is missing; fall back to an
        # empty dict so the membership tests and .items() below still work.
        probs = _read_freqs(src_dir / 'freqs.txt') or {}
    min_prob = min(probs.values()) if probs else 0.0
    # Words that have a cluster but no probability get the lowest one seen.
    for word in clusters:
        if word not in probs:
            probs[word] = min_prob

    # Most probable words first. reversed(sorted(...)) is kept on purpose:
    # sorted(..., reverse=True) would order equal probabilities differently.
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        # NOTE(review): only words that have a cluster are stored in the
        # vocab; confirm that dropping cluster-less words is intended.
        if word in clusters:
            entry['prob'] = float(prob)
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry['cluster'] = int(clusters[word][::-1], 2)
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))


def main(lang_data_dir, corpora_dir, model_dir):
    # Build a model directory: tokenizer rules come from lang_data_dir,
    # vocabulary data and WordNet from corpora_dir.  (Kept as comments rather
    # than a docstring, because plac would show a docstring as the CLI help.)
    lang_data_dir, corpora_dir, model_dir = (
        Path(lang_data_dir), Path(corpora_dir), Path(model_dir))

    # Both input directories must already exist.
    assert corpora_dir.exists()
    assert lang_data_dir.exists()

    if not model_dir.exists():
        model_dir.mkdir()

    setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
    setup_vocab(corpora_dir, model_dir / 'vocab')

    # Copy the WordNet dictionary only once; an existing copy is kept.
    wordnet_dst = model_dir / 'wordnet'
    if wordnet_dst.exists():
        return
    copytree(str(corpora_dir / 'wordnet' / 'dict'), str(wordnet_dst))


# Script entry point: let plac parse the command-line arguments and pass them
# to main when the file is run directly (not when it is imported).
if __name__ == '__main__':
    plac.call(main)
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00			`"""Set up a model directory.`

			`Requires:`

			`lang_data --- Rules for the tokenizer`
			`* prefix.txt`
			`* suffix.txt`
			`* infix.txt`
			`* morphs.json`
			`* specials.json`

			`corpora --- Data files`
			`* WordNet`
			`* words.sgt.prob --- Smoothed unigram probabilities`
			`* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters`
			`* vectors.tgz --- output of something like word2vec`
			`"""`
* Fix structure of wordnet directory for init_model 2015-07-23 04:35:38 +00:00			`from __future__ import unicode_literals`

* Add read_freqs function in init_model 2015-07-25 20:16:36 +00:00			`from ast import literal_eval`
* Fix init_model 2015-07-25 20:56:35 +00:00			`import math`
* Add read_freqs function in init_model 2015-07-25 20:16:36 +00:00
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`import plac`
			`from pathlib import Path`

			`from shutil import copyfile`
* Fix copying of tokenizer data in init_model 2015-04-12 02:45:31 +00:00			`from shutil import copytree`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`import codecs`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 16:48:59 +00:00			`from collections import defaultdict`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00
			`from spacy.en import get_lex_props`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 16:48:59 +00:00			`from spacy.en.lemmatizer import Lemmatizer`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`from spacy.vocab import Vocab`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00			`from spacy.vocab import write_binary_vectors`
* Add read_freqs function in init_model 2015-07-25 20:16:36 +00:00			`from spacy.strings import hash_string`
			`from preshed.counter import PreshCounter`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 16:48:59 +00:00			`from spacy.parts_of_speech import NOUN, VERB, ADJ`

* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00
			`def setup_tokenizer(lang_data_dir, tok_dir):`
			`if not tok_dir.exists():`
			`tok_dir.mkdir()`

			`for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',`
			`'suffix.txt'):`
			`src = lang_data_dir / filename`
			`dst = tok_dir / filename`
			`if not dst.exists():`
* Fix copying of tokenizer data in init_model 2015-04-12 02:45:31 +00:00			`copyfile(str(src), str(dst))`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00

			`def _read_clusters(loc):`
* Update init_model, making language resources optional 2015-07-21 22:25:14 +00:00			`if not loc.exists():`
* Fix structure of wordnet directory for init_model 2015-07-23 04:35:38 +00:00			`print("Warning: Clusters file not found")`
* Update init_model, making language resources optional 2015-07-21 22:25:14 +00:00			`return {}`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`clusters = {}`
			`for line in codecs.open(str(loc), 'r', 'utf8'):`
			`try:`
			`cluster, word, freq = line.split()`
			`except ValueError:`
			`continue`
* Exclude clusterings for words only seen 1 or 2 times, as their clusters are unreliable 2015-04-17 02:44:52 +00:00			`# If the clusterer has only seen the word a few times, its cluster is`
			`# unreliable.`
			`if int(freq) >= 3:`
			`clusters[word] = cluster`
* Add cluster=0 by default in init_model 2015-04-29 12:23:13 +00:00			`else:`
			`clusters[word] = '0'`
* Add case expansion to Brown clusters 2015-05-31 03:50:50 +00:00			`# Expand clusters with re-casing`
Py3 compatibility tweak 2015-07-23 11:13:15 +00:00			`for word, cluster in list(clusters.items()):`
* Add case expansion to Brown clusters 2015-05-31 03:50:50 +00:00			`if word.lower() not in clusters:`
			`clusters[word.lower()] = cluster`
			`if word.title() not in clusters:`
			`clusters[word.title()] = cluster`
* Fix cluster initialization 2015-05-31 13:21:28 +00:00			`if word.upper() not in clusters:`
* Add case expansion to Brown clusters 2015-05-31 03:50:50 +00:00			`clusters[word.upper()] = cluster`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`return clusters`


			`def _read_probs(loc):`
* Update init_model, making language resources optional 2015-07-21 22:25:14 +00:00			`if not loc.exists():`
* Fix structure of wordnet directory for init_model 2015-07-23 04:35:38 +00:00			`print("Warning: Probabilities file not found")`
* Update init_model, making language resources optional 2015-07-21 22:25:14 +00:00			`return {}`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`probs = {}`
			`for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):`
			`prob, word = line.split()`
			`prob = float(prob)`
			`probs[word] = prob`
			`return probs`


* Add read_freqs function in init_model 2015-07-25 20:16:36 +00:00			`def _read_freqs(loc):`
* Fix init_model 2015-07-25 20:54:08 +00:00			`if not loc.exists():`
			`print("Warning: Frequencies file not found")`
			`return None`
* Add read_freqs function in init_model 2015-07-25 20:16:36 +00:00			`counts = PreshCounter()`
			`total = 0`
* Fix init_model 2015-07-25 20:56:35 +00:00			`for i, line in enumerate(loc.open()):`
* Add read_freqs function in init_model 2015-07-25 20:16:36 +00:00			`freq, doc_freq, key = line.split('\t', 2)`
			`freq = int(freq)`
* Fix init_model 2015-07-25 20:56:35 +00:00			`counts.inc(i+1, freq)`
* Add read_freqs function in init_model 2015-07-25 20:16:36 +00:00			`total += freq`
			`counts.smooth()`
			`log_total = math.log(total)`
			`probs = {}`
* Fix init_model 2015-07-25 20:56:35 +00:00			`for line in loc.open():`
* Add read_freqs function in init_model 2015-07-25 20:16:36 +00:00			`freq, doc_freq, key = line.split('\t', 2)`
			`if int(doc_freq) >= 2 and int(freq) >= 5 and len(key) < 200:`
			`word = literal_eval(key)`
			`smooth_count = counts.smoother(int(freq))`
			`log_smooth_count = math.log(smooth_count)`
			`probs[word] = math.log(smooth_count) - log_total`
			`probs['-OOV-'] = math.log(counts.smoother(0)) - log_total`
			`return probs`


* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 16:48:59 +00:00			`def _read_senses(loc):`
			`lexicon = defaultdict(lambda: defaultdict(list))`
* Update init_model, making language resources optional 2015-07-21 22:25:14 +00:00			`if not loc.exists():`
* Fix structure of wordnet directory for init_model 2015-07-23 04:35:38 +00:00			`print("Warning: WordNet senses not found")`
* Update init_model, making language resources optional 2015-07-21 22:25:14 +00:00			`return lexicon`
* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization. 2015-07-01 16:48:59 +00:00			`sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))`
			`pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}`
			`for line in codecs.open(str(loc), 'r', 'utf8'):`
			`sense_strings = line.split()`
			`word = sense_strings.pop(0)`
			`for sense in sense_strings:`
			`pos, sense = sense[3:].split('.')`
			`sense_name = '%s_%s' % (pos[0].upper(), sense.lower())`
			`if sense_name != 'N_tops':`
			`sense_id = sense_names[sense_name]`
			`lexicon[word][pos_ids[pos]].append(sense_id)`
			`return lexicon`


* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`def setup_vocab(src_dir, dst_dir):`
			`if not dst_dir.exists():`
			`dst_dir.mkdir()`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00
			`vectors_src = src_dir / 'vectors.tgz'`
			`if vectors_src.exists():`
			`write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))`
* Update init_model, making language resources optional 2015-07-21 22:25:14 +00:00			`else:`
* Fix structure of wordnet directory for init_model 2015-07-23 04:35:38 +00:00			`print("Warning: Word vectors file not found")`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)`
			`clusters = _read_clusters(src_dir / 'clusters.txt')`
			`probs = _read_probs(src_dir / 'words.sgt.prob')`
* Add read_freqs function in init_model 2015-07-25 20:16:36 +00:00			`if not probs:`
			`probs = _read_freqs(src_dir / 'freqs.txt')`
* Add cluster words to probs in init_model 2015-07-23 07:27:07 +00:00			`if not probs:`
			`min_prob = 0.0`
			`else:`
			`min_prob = min(probs.values())`
			`for word in clusters:`
			`if word not in probs:`
			`probs[word] = min_prob`

* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`lexicon = []`
Py3 compatibility tweak 2015-07-23 07:45:15 +00:00			`for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`entry = get_lex_props(word)`
* Remove probability cap on lexicon 2015-07-25 21:05:51 +00:00			`if word in clusters:`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`entry['prob'] = float(prob)`
			`cluster = clusters.get(word, '0')`
			`# Decode as a little-endian string, so that we can do & 15 to get`
			`# the first 4 bits. See _parse_features.pyx`
			`entry['cluster'] = int(cluster[::-1], 2)`
			`vocab[word] = entry`
			`vocab.dump(str(dst_dir / 'lexemes.bin'))`
			`vocab.strings.dump(str(dst_dir / 'strings.txt'))`


* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00			`def main(lang_data_dir, corpora_dir, model_dir):`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`model_dir = Path(model_dir)`
			`lang_data_dir = Path(lang_data_dir)`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00			`corpora_dir = Path(corpora_dir)`

			`assert corpora_dir.exists()`
			`assert lang_data_dir.exists()`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00
			`if not model_dir.exists():`
			`model_dir.mkdir()`

			`setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00			`setup_vocab(corpora_dir, model_dir / 'vocab')`
			`if not (model_dir / 'wordnet').exists():`
* Fix structure of wordnet directory for init_model 2015-07-23 04:35:38 +00:00			`copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00

			`if __name__ == '__main__':`
			`plac.call(main)`