spaCy/bin/init_model.py

"""Set up a model directory.

Requires:

    lang_data --- Rules for the tokenizer
        * prefix.txt
        * suffix.txt
        * infix.txt
        * morphs.json
        * specials.json

    corpora --- Data files
        * WordNet
        * words.sgt.prob --- Smoothed unigram probabilities
        * clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
        * vectors.tgz --- output of something like word2vec
"""
import plac
from pathlib import Path

from shutil import copyfile
from shutil import copytree
import codecs

from spacy.en import get_lex_props
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors


def setup_tokenizer(lang_data_dir, tok_dir):
    """Copy the tokenizer rule files from lang_data_dir into tok_dir.

    tok_dir is created if it does not exist yet. A file that is already
    present in tok_dir is left untouched, so re-running the script never
    overwrites existing tokenizer data.

    Both arguments are expected to be pathlib.Path objects.
    """
    rule_files = ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',
                  'suffix.txt')
    if not tok_dir.exists():
        tok_dir.mkdir()

    for name in rule_files:
        target = tok_dir / name
        if target.exists():
            # Keep whatever is already installed.
            continue
        copyfile(str(lang_data_dir / name), str(target))


def _read_clusters(loc):
    """Read a word -> cluster mapping from the clusters file at loc.

    Each usable line holds three whitespace-separated fields, in the order
    cluster, word, frequency. Lines that do not split into exactly three
    fields are skipped.

    Returns a dict mapping each word to its cluster string. Words whose
    frequency is below 3 are mapped to '0' instead of their cluster.

    Raises ValueError if a three-field line has a non-integer frequency.
    """
    clusters = {}
    # Use a context manager so the file handle is closed deterministically,
    # including when a bad frequency raises part-way through the file.
    with codecs.open(str(loc), 'r', 'utf8') as file_:
        for line in file_:
            try:
                cluster, word, freq = line.split()
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable.
            clusters[word] = cluster if int(freq) >= 3 else '0'
    return clusters


def _read_probs(loc):
    """Read a word -> probability mapping from the file at loc.

    Each line holds two whitespace-separated fields, in the order
    probability, word. Blank (whitespace-only) lines are skipped, so a
    trailing empty line does not abort the load.

    Returns a dict mapping each word to its probability as a float.

    Raises ValueError if a non-blank line does not have exactly two fields
    or if its probability field is not a valid float.
    """
    probs = {}
    # Context manager closes the handle even if a malformed line raises.
    with codecs.open(str(loc), 'r', 'utf8') as file_:
        for line in file_:
            fields = line.split()
            if not fields:
                continue
            prob, word = fields
            probs[word] = float(prob)
    return probs


def setup_vocab(src_dir, dst_dir):
    """Build the vocabulary files in dst_dir from the data files in src_dir.

    Reads 'clusters.txt' and 'words.sgt.prob' from src_dir, and
    'vectors.tgz' if it is present. Writes 'lexemes.bin' and 'strings.txt'
    to dst_dir, plus 'vec.bin' when vectors were found. dst_dir is created
    if it does not exist.

    Both arguments are expected to be pathlib.Path objects.
    """
    if not dst_dir.exists():
        dst_dir.mkdir()

    # Word vectors are optional; convert them only when the archive exists.
    vectors_loc = src_dir / 'vectors.tgz'
    if vectors_loc.exists():
        write_binary_vectors(str(vectors_loc), str(dst_dir / 'vec.bin'))

    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')

    # Every clustered word must reach the vocab, so words missing from the
    # probability list are given the value -17.0.
    for clustered_word in clusters:
        probs.setdefault(clustered_word, -17.0)

    # Visit words from highest to lowest probability. Sorting ascending and
    # then reversing keeps the same order among equal probabilities as
    # reversed(sorted(...)).
    by_prob = sorted(probs.items(), key=lambda pair: pair[1])
    by_prob.reverse()
    for word, prob in by_prob:
        props = get_lex_props(word)
        keep = word in clusters or float(prob) >= -17
        if not keep:
            continue
        props['prob'] = float(prob)
        bits = clusters.get(word, '0')
        # Parse the reversed bit string, so the first characters of the
        # cluster end up in the low-order bits and `& 15` yields the first
        # 4 bits. See _parse_features.pyx
        props['cluster'] = int(bits[::-1], 2)
        vocab[word] = props

    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))


def main(lang_data_dir, corpora_dir, model_dir):
    """Set up a model directory from tokenizer rules and corpus data.

    lang_data_dir -- directory holding the tokenizer rule files
    corpora_dir -- directory holding the data files and a 'wordnet' directory
    model_dir -- output directory; created if it does not exist

    Populates model_dir / 'tokenizer' and model_dir / 'vocab', and copies
    the 'wordnet' directory from corpora_dir unless model_dir already has
    one.

    Raises AssertionError if corpora_dir or lang_data_dir does not exist.
    """
    model_dir = Path(model_dir)
    lang_data_dir = Path(lang_data_dir)
    corpora_dir = Path(corpora_dir)

    # Validate with explicit checks instead of `assert` statements, so the
    # checks still run under `python -O` and the error names the missing
    # path. AssertionError is kept so the exception type is unchanged.
    if not corpora_dir.exists():
        raise AssertionError(
            "corpora_dir does not exist: %s" % corpora_dir)
    if not lang_data_dir.exists():
        raise AssertionError(
            "lang_data_dir does not exist: %s" % lang_data_dir)

    if not model_dir.exists():
        model_dir.mkdir()

    setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
    setup_vocab(corpora_dir, model_dir / 'vocab')
    # An existing wordnet copy is left in place.
    if not (model_dir / 'wordnet').exists():
        copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))


# Script entry point: hand main to plac, which calls it with the
# command-line arguments.
if __name__ == '__main__':
    plac.call(main)
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00			`"""Set up a model directory.`

			`Requires:`

			`lang_data --- Rules for the tokenizer`
			`* prefix.txt`
			`* suffix.txt`
			`* infix.txt`
			`* morphs.json`
			`* specials.json`

			`corpora --- Data files`
			`* WordNet`
			`* words.sgt.prob --- Smoothed unigram probabilities`
			`* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters`
			`* vectors.tgz --- output of something like word2vec`
			`"""`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`import plac`
			`from pathlib import Path`

			`from shutil import copyfile`
* Fix copying of tokenizer data in init_model 2015-04-12 02:45:31 +00:00			`from shutil import copytree`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`import codecs`

			`from spacy.en import get_lex_props`
			`from spacy.vocab import Vocab`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00			`from spacy.vocab import write_binary_vectors`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00

			`def setup_tokenizer(lang_data_dir, tok_dir):`
			`if not tok_dir.exists():`
			`tok_dir.mkdir()`

			`for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',`
			`'suffix.txt'):`
			`src = lang_data_dir / filename`
			`dst = tok_dir / filename`
			`if not dst.exists():`
* Fix copying of tokenizer data in init_model 2015-04-12 02:45:31 +00:00			`copyfile(str(src), str(dst))`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00

			`def _read_clusters(loc):`
			`clusters = {}`
			`for line in codecs.open(str(loc), 'r', 'utf8'):`
			`try:`
			`cluster, word, freq = line.split()`
			`except ValueError:`
			`continue`
* Exclude clusterings for words only seen 1 or 2 times, as their clusters are unreliable 2015-04-17 02:44:52 +00:00			`# If the clusterer has only seen the word a few times, its cluster is`
			`# unreliable.`
			`if int(freq) >= 3:`
			`clusters[word] = cluster`
* Add cluster=0 by default in init_model 2015-04-29 12:23:13 +00:00			`else:`
			`clusters[word] = '0'`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`return clusters`


			`def _read_probs(loc):`
			`probs = {}`
			`for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):`
			`prob, word = line.split()`
			`prob = float(prob)`
			`probs[word] = prob`
			`return probs`


			`def setup_vocab(src_dir, dst_dir):`
			`if not dst_dir.exists():`
			`dst_dir.mkdir()`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00
			`vectors_src = src_dir / 'vectors.tgz'`
			`if vectors_src.exists():`
			`write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)`
			`clusters = _read_clusters(src_dir / 'clusters.txt')`
			`probs = _read_probs(src_dir / 'words.sgt.prob')`
* Ensure words in Brown clusters make it into the vocab, even if they're not in our probs list 2015-05-31 03:46:16 +00:00			`for word in clusters:`
			`if word not in probs:`
			`probs[word] = -17.0`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`lexicon = []`
			`for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):`
			`entry = get_lex_props(word)`
			`if word in clusters or float(prob) >= -17:`
			`entry['prob'] = float(prob)`
			`cluster = clusters.get(word, '0')`
			`# Decode as a little-endian string, so that we can do & 15 to get`
			`# the first 4 bits. See _parse_features.pyx`
			`entry['cluster'] = int(cluster[::-1], 2)`
			`vocab[word] = entry`
			`vocab.dump(str(dst_dir / 'lexemes.bin'))`
			`vocab.strings.dump(str(dst_dir / 'strings.txt'))`


* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00			`def main(lang_data_dir, corpora_dir, model_dir):`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00			`model_dir = Path(model_dir)`
			`lang_data_dir = Path(lang_data_dir)`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00			`corpora_dir = Path(corpora_dir)`

			`assert corpora_dir.exists()`
			`assert lang_data_dir.exists()`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00
			`if not model_dir.exists():`
			`model_dir.mkdir()`

			`setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')`
* Add docstring explaining script a bit, and add handling of word vectors 2015-04-08 06:20:15 +00:00			`setup_vocab(corpora_dir, model_dir / 'vocab')`
			`if not (model_dir / 'wordnet').exists():`
			`copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))`
* Add new script to replace make_lexicon, that does full setup of data 2015-04-08 05:46:53 +00:00

			`if __name__ == '__main__':`
			`plac.call(main)`