diff --git a/.appveyor.yml b/.appveyor.yml
index 8f0a21967..4dcd75e9c 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -8,16 +8,24 @@ environment:
   matrix:
 
     # Python 2.7.10 is the latest version and is not pre-installed.
 
     - PYTHON: "C:\\Python27.10-x64"
       PYTHON_VERSION: "2.7.10"
       PYTHON_ARCH: "64"
 
+    - PYTHON: "C:\\Python27.10-x32"
+      PYTHON_VERSION: "2.7.10"
+      PYTHON_ARCH: "32"
+
     # The lastest Python 3.4.
     - PYTHON: "C:\\Python34-x64"
       PYTHON_VERSION: "3.4.x" # currently 3.4.3
       PYTHON_ARCH: "64"
 
+    #- PYTHON: "C:\\Python34-x32"
+    #  PYTHON_VERSION: "3.4.x" # currently 3.4.3
+    #  PYTHON_ARCH: "32"
+
 
 install:
   # Install Python (from the official .msi of http://python.org) and pip when
   # not already installed.
@@ -30,10 +38,11 @@ install:
   - "SET PYTHONPATH=%CD%;%PYTHONPATH%"
 
   # Filesystem root
-  # - ps: "ls \"C:/\""
+  #- ps: "ls \"C:/\""
+  #- SET
 
   # Installed SDKs
-  # - ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\""
+  #- ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\""
 
   # Checking stdint.h
   #- ps: "ls \"C:/projects/spacy/include/\""
diff --git a/README.md b/README.md
index ad384fd2b..8eb39ba01 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,6 @@
+
+
+
 spaCy: Industrial-strength NLP
 ==============================
 
@@ -49,3 +52,6 @@ Difficult to support:
 
 * PyPy 2.7
 * PyPy 3.4
+
+
+
diff --git a/spacy/en/download.py b/spacy/en/download.py
index 6180c4766..11ed96caa 100644
--- a/spacy/en/download.py
+++ b/spacy/en/download.py
@@ -1,5 +1,4 @@
 from __future__ import print_function
-from os import path
 import sys
 import os
 import tarfile
@@ -15,45 +14,44 @@
 AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com'
 
 ALL_DATA_DIR_URL = '%s/en_data_all-%s.tgz' % (AWS_STORE, VERSION)
 
-DEST_DIR = path.join(path.dirname(path.abspath(__file__)), 'data')
+DEST_DIR = os.path.dirname(os.path.abspath(__file__))
 
-def download_file(url, dest_dir):
-    return uget.download(url, dest_dir, console=sys.stdout)
+def download_file(url, download_path):
+    return uget.download(url, download_path, console=sys.stdout)
 
 
-def install_data(url, dest_dir):
-    filename = download_file(url, dest_dir)
-    t = tarfile.open(filename)
-    t.extractall(dest_dir)
+def install_data(url, extract_path, download_path):
+    try:
+        os.makedirs(extract_path)
+    except FileExistsError:
+        pass
 
-
-def install_parser_model(url, dest_dir):
-    filename = download_file(url, dest_dir)
-    t = tarfile.open(filename, mode=":gz")
-    t.extractall(dest_dir)
-
-
-def install_dep_vectors(url, dest_dir):
-    download_file(url, dest_dir)
+    tmp = download_file(url, download_path)
+    assert tmp == download_path
+    t = tarfile.open(download_path)
+    t.extractall(extract_path)
 
 
 @plac.annotations(
     force=("Force overwrite", "flag", "f", bool),
 )
 def main(data_size='all', force=False):
-    if data_size == 'all':
-        data_url = ALL_DATA_DIR_URL
-    elif data_size == 'small':
-        data_url = SM_DATA_DIR_URL
+    filename = ALL_DATA_DIR_URL.rsplit('/', 1)[1]
+    download_path = os.path.join(DEST_DIR, filename)
+    data_path = os.path.join(DEST_DIR, 'data')
 
-    if force and path.exists(DEST_DIR):
-        shutil.rmtree(DEST_DIR)
+    if force and os.path.exists(download_path):
+        os.unlink(download_path)
 
-    if not os.path.exists(DEST_DIR):
-        os.makedirs(DEST_DIR)
+    if force and os.path.exists(data_path):
+        shutil.rmtree(data_path)
 
-    install_data(data_url, DEST_DIR)
+    if os.path.exists(data_path):
+        print('data already installed at %s, overwrite with --force' % DEST_DIR)
+        sys.exit(1)
+
+    install_data(ALL_DATA_DIR_URL, DEST_DIR, download_path)
 
 
 if __name__ == '__main__':
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index c1d296d7c..08e511f68 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -19,7 +19,7 @@ class Lemmatizer(object):
             index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos))
             exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos))
         if path.exists(path.join(data_dir, 'vocab', 'lemma_rules.json')):
-            rules = json.load(open(path.join(data_dir, 'vocab', 'lemma_rules.json')))
+            rules = json.load(codecs.open(path.join(data_dir, 'vocab', 'lemma_rules.json'), encoding='utf_8'))
         else:
             rules = {}
         return cls(index, exc, rules)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index c0cc6803b..93be3e363 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -120,6 +120,9 @@ cdef class Doc:
     def __str__(self):
         return u''.join([t.string for t in self])
 
+    def __repr__(self):
+        return u''.join([t.string for t in self])
+
     def similarity(self, other):
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx
index e8d2f2e59..e1b881f79 100644
--- a/spacy/tokens/spans.pyx
+++ b/spacy/tokens/spans.pyx
@@ -46,6 +46,12 @@ cdef class Span:
             return 0
         return self.end - self.start
 
+    def __repr__(self):
+        text = self.text_with_ws
+        if self[-1].whitespace_:
+            text = text[:-1]
+        return text
+
     def __getitem__(self, object i):
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index a7447fb79..cce8eeeb4 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -43,6 +43,9 @@ cdef class Token:
     def __str__(self):
         return self.string
 
+    def __repr__(self):
+        return self.string
+
     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
         return Lexeme.c_check_flag(self.c.lex, flag_id)
 
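
Note on the reworked spacy/en/download.py above: the script now downloads the en_data_all tarball next to the package, refuses to overwrite an existing data directory unless --force is given, and then extracts the archive in place. The following is a minimal, self-contained sketch of that download-then-extract flow, not the project's code: it substitutes urllib's urlretrieve for the internal uget helper, uses a hypothetical URL and paths, and catches OSError rather than FileExistsError so the same sketch also runs on Python 2 (FileExistsError exists only on Python 3).

import os
import tarfile

try:
    from urllib.request import urlretrieve   # Python 3
except ImportError:
    from urllib import urlretrieve            # Python 2

def install_data(url, extract_path, download_path):
    # Create the extraction directory if it is missing; ignore "already exists".
    try:
        os.makedirs(extract_path)
    except OSError:
        if not os.path.isdir(extract_path):
            raise
    # Fetch the tarball to a known location, then unpack it in place.
    urlretrieve(url, download_path)
    with tarfile.open(download_path) as archive:
        archive.extractall(extract_path)

if __name__ == '__main__':
    # Hypothetical values; the real script builds these from ALL_DATA_DIR_URL
    # and the directory containing the spacy.en package.
    url = 'https://example.com/en_data_all-0.9.0.tgz'
    dest = os.path.abspath('spacy_en_demo')
    install_data(url, dest, os.path.join(dest, 'en_data_all-0.9.0.tgz'))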
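
Note on the one-line spacy/lemmatizer.py change: replacing the bare open() with codecs.open(..., encoding='utf_8') means lemma_rules.json is always decoded as UTF-8 rather than with the platform default encoding, which matters on the Windows workers added in .appveyor.yml. A sketch of the same pattern, with a hypothetical file path:

import codecs
import json

# Decode the file explicitly as UTF-8; json.load then behaves the same on
# Python 2 and 3 regardless of the locale's default encoding.
with codecs.open('vocab/lemma_rules.json', encoding='utf_8') as file_:
    rules = json.load(file_)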
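
Note on the __repr__ methods added to Doc, Span and Token: they make the objects echo their text at the interactive prompt instead of the default <spacy.tokens.doc.Doc object at 0x...> form. A rough usage sketch, assuming the English model data has already been installed with the download script:

from spacy.en import English

nlp = English()
doc = nlp(u'Hello, world.')

print(repr(doc))       # Hello, world.  (Doc.__repr__ joins the token strings)
print(repr(doc[0]))    # Hello          (Token.__repr__ returns the token string)
print(repr(doc[0:2]))  # Hello,         (Span.__repr__ drops the trailing whitespace)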