From c0c5f319508f93fe66da795b6f6dc5b34c322b91 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 8 Dec 2016 20:39:49 +0100 Subject: [PATCH] Remove unused data and download script --- spacy/en/regexes.py | 47 --------- spacy/en/uget.py | 246 -------------------------------------------- 2 files changed, 293 deletions(-) delete mode 100644 spacy/en/regexes.py delete mode 100644 spacy/en/uget.py diff --git a/spacy/en/regexes.py b/spacy/en/regexes.py deleted file mode 100644 index 98e745239..000000000 --- a/spacy/en/regexes.py +++ /dev/null @@ -1,47 +0,0 @@ -import re - - -_mw_prepositions = [ - 'close to', - 'down by', - 'on the way to', - 'on my way to', - 'on my way', - 'on his way to', - 'on his way', - 'on her way to', - 'on her way', - 'on your way to', - 'on your way', - 'on our way to', - 'on our way', - 'on their way to', - 'on their way', - 'along the route from' -] - - -MW_PREPOSITIONS_RE = re.compile('|'.join(_mw_prepositions), flags=re.IGNORECASE) - - -TIME_RE = re.compile( - '{colon_digits}|{colon_digits} ?{am_pm}?|{one_two_digits} ?({am_pm})'.format( - colon_digits=r'[0-2]?[0-9]:[0-5][0-9](?::[0-5][0-9])?', - one_two_digits=r'[0-2]?[0-9]', - am_pm=r'[ap]\.?m\.?')) - -DATE_RE = re.compile( - '(?:this|last|next|the) (?:week|weekend|{days})'.format( - days='Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday' - )) - - -MONEY_RE = re.compile('\$\d+(?:\.\d+)?|\d+ dollars(?: \d+ cents)?') - - -DAYS_RE = re.compile('Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday') - - -REGEXES = [('IN', 'O', MW_PREPOSITIONS_RE), ('CD', 'TIME', TIME_RE), - ('NNP', 'DATE', DATE_RE), - ('NNP', 'DATE', DAYS_RE), ('CD', 'MONEY', MONEY_RE)] diff --git a/spacy/en/uget.py b/spacy/en/uget.py deleted file mode 100644 index 3cdf6c8d6..000000000 --- a/spacy/en/uget.py +++ /dev/null @@ -1,246 +0,0 @@ -import os -import time -import io -import math -import re - -try: - from urllib.parse import urlparse - from urllib.request import urlopen, Request - from urllib.error import HTTPError -except ImportError: - from urllib2 import urlopen, urlparse, Request, HTTPError - - -class UnknownContentLengthException(Exception): pass -class InvalidChecksumException(Exception): pass -class UnsupportedHTTPCodeException(Exception): pass -class InvalidOffsetException(Exception): pass -class MissingChecksumHeader(Exception): pass - - -CHUNK_SIZE = 16 * 1024 - - -class RateSampler(object): - def __init__(self, period=1): - self.rate = None - self.reset = True - self.period = period - - def __enter__(self): - if self.reset: - self.reset = False - self.start = time.time() - self.counter = 0 - - def __exit__(self, type, value, traceback): - elapsed = time.time() - self.start - if elapsed >= self.period: - self.reset = True - self.rate = float(self.counter) / elapsed - - def update(self, value): - self.counter += value - - def format(self, unit="MB"): - if self.rate is None: - return None - - divisor = {'MB': 1048576, 'kB': 1024} - return "%0.2f%s/s" % (self.rate / divisor[unit], unit) - - -class TimeEstimator(object): - def __init__(self, cooldown=1): - self.cooldown = cooldown - self.start = time.time() - self.time_left = None - - def update(self, bytes_read, total_size): - elapsed = time.time() - self.start - if elapsed > self.cooldown: - self.time_left = math.ceil(elapsed * total_size / - bytes_read - elapsed) - - def format(self): - if self.time_left is None: - return None - - res = "eta " - if self.time_left / 60 >= 1: - res += "%dm " % (self.time_left / 60) - return res + "%ds" % (self.time_left % 60) - - -def format_bytes_read(bytes_read, unit="MB"): - divisor = {'MB': 1048576, 'kB': 1024} - return "%0.2f%s" % (float(bytes_read) / divisor[unit], unit) - - -def format_percent(bytes_read, total_size): - percent = round(bytes_read * 100.0 / total_size, 2) - return "%0.2f%%" % percent - - -def get_content_range(response): - content_range = response.headers.get('Content-Range', "").strip() - if content_range: - m = re.match(r"bytes (\d+)-(\d+)/(\d+)", content_range) - if m: - return [int(v) for v in m.groups()] - - -def get_content_length(response): - if 'Content-Length' not in response.headers: - raise UnknownContentLengthException - return int(response.headers.get('Content-Length').strip()) - - -def get_url_meta(url, checksum_header=None): - class HeadRequest(Request): - def get_method(self): - return "HEAD" - - r = urlopen(HeadRequest(url)) - res = {'size': get_content_length(r)} - - if checksum_header: - value = r.headers.get(checksum_header) - if value: - res['checksum'] = value - - r.close() - return res - - -def progress(console, bytes_read, total_size, transfer_rate, eta): - fields = [ - format_bytes_read(bytes_read), - format_percent(bytes_read, total_size), - transfer_rate.format(), - eta.format(), - " " * 10, - ] - console.write("Downloaded %s\r" % " ".join(filter(None, fields))) - console.flush() - - -def read_request(request, offset=0, console=None, - progress_func=None, write_func=None): - # support partial downloads - if offset > 0: - request.add_header('Range', "bytes=%s-" % offset) - - try: - response = urlopen(request) - except HTTPError as e: - if e.code == 416: # Requested Range Not Satisfiable - raise InvalidOffsetException - - # TODO add http error handling here - raise UnsupportedHTTPCodeException(e.code) - - total_size = get_content_length(response) + offset - bytes_read = offset - - # sanity checks - if response.code == 200: # OK - assert offset == 0 - elif response.code == 206: # Partial content - range_start, range_end, range_total = get_content_range(response) - assert range_start == offset - assert range_total == total_size - assert range_end + 1 - range_start == total_size - bytes_read - else: - raise UnsupportedHTTPCodeException(response.code) - - eta = TimeEstimator() - transfer_rate = RateSampler() - - if console: - if offset > 0: - console.write("Continue downloading...\n") - else: - console.write("Downloading...\n") - - while True: - with transfer_rate: - chunk = response.read(CHUNK_SIZE) - if not chunk: - if progress_func and console: - console.write('\n') - break - - bytes_read += len(chunk) - - transfer_rate.update(len(chunk)) - eta.update(bytes_read - offset, total_size - offset) - - if progress_func and console: - progress_func(console, bytes_read, total_size, transfer_rate, eta) - - if write_func: - write_func(chunk) - - response.close() - assert bytes_read == total_size - return response - - -def download(url, path=".", - checksum=None, checksum_header=None, - headers=None, console=None): - - if os.path.isdir(path): - path = os.path.join(path, url.rsplit('/', 1)[1]) - path = os.path.abspath(path) - - with io.open(path, "a+b") as f: - size = f.tell() - - # update checksum of partially downloaded file - if checksum: - f.seek(0, os.SEEK_SET) - for chunk in iter(lambda: f.read(CHUNK_SIZE), b""): - checksum.update(chunk) - - def write(chunk): - if checksum: - checksum.update(chunk) - f.write(chunk) - - request = Request(url) - - # request headers - if headers: - for key, value in headers.items(): - request.add_header(key, value) - - try: - response = read_request(request, - offset=size, - console=console, - progress_func=progress, - write_func=write) - except InvalidOffsetException: - response = None - - if checksum: - if response: - origin_checksum = response.headers.get(checksum_header) - else: - # check whether file is already complete - meta = get_url_meta(url, checksum_header) - origin_checksum = meta.get('checksum') - - if origin_checksum is None: - raise MissingChecksumHeader - - if checksum.hexdigest() != origin_checksum: - raise InvalidChecksumException - - if console: - console.write("checksum/sha256 OK\n") - - return path