2019-03-12 11:07:56 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
from __future__ import print_function, unicode_literals, division
|
|
|
|
|
|
|
|
import logging
|
|
|
|
from pathlib import Path
|
|
|
|
from collections import defaultdict
|
|
|
|
from gensim.models import Word2Vec
|
|
|
|
import plac
|
|
|
|
import spacy
|
|
|
|
|
|
|
|
# Module-level logger, named after this module via `__name__`.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
class Corpus(object):
    """Stream a directory of UTF-8 text files as lists of token strings.

    Iterating the object walks every file returned by `iter_dir(directory)`,
    splits its text into paragraphs on blank lines, and yields one list of
    token strings per non-empty paragraph. `__iter__` is a generator method,
    so each new iteration starts again from the first file.
    """

    def __init__(self, directory, nlp):
        """Store the corpus location and the tokenizing callable.

        directory: Path (or path string) of the corpus directory; it is
            passed unchanged to `iter_dir`.
        nlp: Callable applied to each paragraph string; it must return an
            iterable of tokens exposing an `orth_` attribute.
        """
        self.directory = directory
        self.nlp = nlp

    def __iter__(self):
        """Yield one list of `orth_` strings per non-empty paragraph."""
        for text_loc in iter_dir(self.directory):
            # Read the whole file up front so the handle is closed before
            # control is handed back to the consumer at `yield`.
            with text_loc.open("r", encoding="utf-8") as file_:
                text = file_.read()

            # This is to keep the input to the blank model (which doesn't
            # sentencize) from being too long. It works particularly well with
            # the output of [WikiExtractor](https://github.com/attardi/wikiextractor)
            for par in text.split("\n\n"):
                # Trailing newlines and runs of blank lines produce empty or
                # whitespace-only chunks; they carry no words, so do not
                # emit them as sentences.
                if not par.strip():
                    continue
                yield [word.orth_ for word in self.nlp(par)]
|
2019-03-12 11:07:56 +00:00
|
|
|
|
|
|
|
|
|
|
|
def iter_dir(loc):
    """Yield the path of every non-directory entry found under `loc`.

    Sub-directories are descended into recursively, so files at any nesting
    depth are returned and a directory itself is never yielded (callers open
    every yielded path as a file). Entries are produced in the order
    `Path.iterdir()` reports them, with the contents of a sub-directory
    emitted at the position where that sub-directory was encountered.

    loc: Path (or path string) of the directory to walk.
    """
    for entry in Path(loc).iterdir():
        if entry.is_dir():
            # Explicit loop rather than `yield from`: the file still carries
            # `from __future__` imports for Python 2 compatibility.
            for nested in iter_dir(entry):
                yield nested
        else:
            yield entry
|
|
|
|
|
|
|
|
|
|
|
|
@plac.annotations(
    lang=("ISO language code"),
    in_dir=("Location of input directory"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=5,
):
    """Train word2vec vectors on a directory of text files and save the model.

    The text under `in_dir` is tokenized with a blank spaCy pipeline for
    `lang`, streamed to gensim's Word2Vec, and the trained model is written
    to `out_loc`. The meaning of each argument is given by the plac
    annotations on this function.
    """
    logging.basicConfig(
        format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
    )
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir, nlp)
    model = Word2Vec(
        sentences=corpus,
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
        # Previously `nr_iter` was accepted on the command line but never
        # forwarded, so `-i` had no effect. `iter` is the epoch-count keyword
        # in the gensim API generation that also uses `size`.
        iter=nr_iter,
    )
    model.save(out_loc)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: plac builds the command-line parser from the
    # annotations on `main` and invokes it with the parsed arguments.
    plac.call(main)
|