mirror of https://github.com/explosion/spaCy.git
Add support for .zip to init_model
commit 7ee880a0ad
parent 5ecb274764
@@ -10,17 +10,12 @@ from pathlib import Path
 from preshed.counter import PreshCounter
 import tarfile
 import gzip
+import zipfile
 
-from ._messages import Messages
+from ..compat import fix_text
 from ..vectors import Vectors
-from ..errors import Warnings, user_warning
 from ..util import prints, ensure_path, get_lang_class
-
-try:
-    import ftfy
-except ImportError:
-    ftfy = None
 
 
 @plac.annotations(
     lang=("model language", "positional", None, str),
@@ -39,13 +34,16 @@ def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc=
     and word vectors.
     """
     if freqs_loc is not None and not freqs_loc.exists():
-        prints(freqs_loc, title=Messages.M037, exits=1)
+        prints(freqs_loc, title="Can't find words frequencies file", exits=1)
     clusters_loc = ensure_path(clusters_loc)
     vectors_loc = ensure_path(vectors_loc)
 
     probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
     vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
     clusters = read_clusters(clusters_loc) if clusters_loc else {}
 
     nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors)
 
     if not output_dir.exists():
         output_dir.mkdir()
     nlp.to_disk(output_dir)
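Review note: together with the open_file() change below, the vectors file passed to this command may now be plain text, .gz, .tar.gz, or .zip. A hypothetical direct call for illustration (paths made up; the module path is assumed, and the function is normally reached through the spacy CLI rather than imported like this):

    from pathlib import Path
    from spacy.cli.init_model import init_model  # module path assumed

    # Build a blank 'en' vocab from a zipped vectors table.
    # prune_vectors=-1 disables pruning (the `>= 1` check never fires).
    init_model('en', Path('/tmp/en_model'), vectors_loc=Path('vectors.zip'),
               prune_vectors=-1)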
@@ -54,20 +52,26 @@ def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc=
 def open_file(loc):
     '''Handle .gz, .tar.gz or unzipped files'''
     loc = ensure_path(loc)
+    print("Open loc")
     if tarfile.is_tarfile(str(loc)):
         return tarfile.open(str(loc), 'r:gz')
     elif loc.parts[-1].endswith('gz'):
         return (line.decode('utf8') for line in gzip.open(str(loc), 'r'))
+    elif loc.parts[-1].endswith('zip'):
+        zip_file = zipfile.ZipFile(str(loc))
+        names = zip_file.namelist()
+        file_ = zip_file.open(names[0])
+        return (line.decode('utf8') for line in file_)
     else:
         return loc.open('r', encoding='utf8')
 
 
 def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors):
     print("Creating model...")
     lang_class = get_lang_class(lang)
     nlp = lang_class()
     for lexeme in nlp.vocab:
         lexeme.rank = 0
     lex_added = 0
     for i, (word, prob) in enumerate(tqdm(sorted(probs.items(), key=lambda item: item[1], reverse=True))):
         lexeme = nlp.vocab[word]
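Review note: the new elif branch reads only the first member of the archive (names[0]); any further members are silently ignored, which matches the single-file vectors dumps this command expects. The same pattern in isolation, with a made-up file name:

    import zipfile

    # Stream the first member of a zip archive line by line as unicode,
    # mirroring the new branch in open_file() above.
    with zipfile.ZipFile('vectors.zip') as zip_file:
        first_member = zip_file.namelist()[0]
        with zip_file.open(first_member) as file_:
            for line in file_:
                print(line.decode('utf8').rstrip())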
@@ -87,13 +91,15 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors):
                 lexeme = nlp.vocab[word]
                 lexeme.is_oov = False
                 lex_added += 1
 
     if len(vectors_data):
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)
     vec_added = len(nlp.vocab.vectors)
-    prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
-           title=Messages.M038)
+    prints("{} entries, {} vectors".format(lex_added, vec_added),
+           title="Sucessfully compiled vocab")
     return nlp
 
 
@@ -104,8 +110,12 @@ def read_vectors(vectors_loc):
     vectors_data = numpy.zeros(shape=shape, dtype='f')
     vectors_keys = []
     for i, line in enumerate(tqdm(f)):
-        pieces = line.split()
+        line = line.rstrip()
+        pieces = line.rsplit(' ', vectors_data.shape[1]+1)
         word = pieces.pop(0)
+        if len(pieces) != vectors_data.shape[1]:
+            print(word, repr(line))
+            raise ValueError("Bad line in file")
         vectors_data[i] = numpy.asarray(pieces, dtype='f')
         vectors_keys.append(word)
     return vectors_data, vectors_keys
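Review note: rstrip() drops the trailing newline so the last column parses cleanly, and the bounded rsplit() plus the explicit length check turn misaligned rows into a clear ValueError instead of a confusing numpy failure. Behaviour with a hypothetical 3-dimensional table:

    dims = 3
    # A well-formed row: one token plus exactly `dims` numeric columns.
    pieces = 'apple 0.1 0.2 0.3'.rsplit(' ', dims + 1)
    word = pieces.pop(0)
    assert word == 'apple' and len(pieces) == dims

    # A row with too many columns (here, a token containing a space)
    # no longer slips through; the length check catches it.
    pieces = 'New York 0.1 0.2 0.3'.rsplit(' ', dims + 1)
    pieces.pop(0)
    assert len(pieces) != dims  # read_vectors() would raise ValueError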
@@ -140,14 +150,11 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
 def read_clusters(clusters_loc):
     print("Reading clusters...")
     clusters = {}
-    if ftfy is None:
-        user_warning(Warnings.W004)
     with clusters_loc.open() as f:
         for line in tqdm(f):
             try:
                 cluster, word, freq = line.split()
-                if ftfy is not None:
-                    word = ftfy.fix_text(word)
+                word = fix_text(word)
             except ValueError:
                 continue
             # If the clusterer has only seen the word a few times, its
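Review note: the conditional ftfy import and its runtime warning are gone; cluster words now always go through fix_text from spacy.compat, so text cleaning no longer depends on whether ftfy happens to be installed. A sketch of the fallback pattern such a compat shim typically uses (an assumption about its internals, not the actual spaCy source):

    # Hypothetical compat-style shim: prefer ftfy, fall back to a no-op.
    try:
        from ftfy import fix_text
    except ImportError:
        def fix_text(text):
            # Without ftfy, leave the text unchanged.
            return text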