diff --git a/spacy/en/download.py b/spacy/en/download.py new file mode 100644 index 000000000..9f74f0620 --- /dev/null +++ b/spacy/en/download.py @@ -0,0 +1,37 @@ +from os import path +import os +import tarfile +import shutil +import requests + +URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz' + +DEST_DIR = path.join(path.dirname(__file__), 'data', 'deps') + +def download_file(url): + local_filename = url.split('/')[-1] + # NOTE the stream=True parameter + r = requests.get(url, stream=True) + with open(local_filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + f.flush() + return local_filename + +def main(): + if not os.path.exists(DEST_DIR): + os.mkdir(DEST_DIR) + assert not path.exists(path.join(DEST_DIR, 'en')) + + + filename = download_file(URL) + t = tarfile.open(filename, mode=":gz") + t.extractall(DEST_DIR) + shutil.move(path.join(DEST_DIR, 'en', 'deps', 'model'), DEST_DIR) + shutil.move(path.join(DEST_DIR, 'en', 'deps', 'config.json'), DEST_DIR) + shutil.rmtree(path.join(DEST_DIR, 'en')) + + +if __name__ == '__main__': + main()