From b5a726c5cdeec35e0d95d45c76780d22318ae4d6 Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 7 May 2017 23:29:22 +0200
Subject: [PATCH] Tidy up deprecated.py

---
 spacy/deprecated.py | 87 ++++++++++-----------------------------------
 1 file changed, 19 insertions(+), 68 deletions(-)

diff --git a/spacy/deprecated.py b/spacy/deprecated.py
index 627188b00..70da5b4a3 100644
--- a/spacy/deprecated.py
+++ b/spacy/deprecated.py
@@ -7,74 +7,7 @@ from . import about
 from . import util
 from .util import prints
 from .compat import path2str
-from .cli import download
-from .cli import link
-
-
-def read_lang_data(package):
-    tokenization = package.load_json(('tokenizer', 'specials.json'))
-    with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
-        prefix = read_prefix(file_) if file_ is not None else None
-    with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
-        suffix = read_suffix(file_) if file_ is not None else None
-    with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
-        infix = read_infix(file_) if file_ is not None else None
-    return tokenization, prefix, suffix, infix
-
-
-def align_tokens(ref, indices): # Deprecated, surely?
-    start = 0
-    queue = list(indices)
-    for token in ref:
-        end = start + len(token)
-        emit = []
-        while queue and queue[0][1] <= end:
-            emit.append(queue.pop(0))
-        yield token, emit
-        start = end
-    assert not queue
-
-
-def detokenize(token_rules, words): # Deprecated?
-    """
-    To align with treebanks, return a list of "chunks", where a chunk is a
-    sequence of tokens that are separated by whitespace in actual strings. Each
-    chunk should be a tuple of token indices, e.g.
-
-    >>> detokenize(["ca<SEP>n't", '<SEP>!'], ["I", "ca", "n't", "!"])
-    [(0,), (1, 2, 3)]
-    """
-    string = ' '.join(words)
-    for subtoks in token_rules:
-        # Algorithmically this is dumb, but writing a little list-based match
-        # machine? Ain't nobody got time for that.
-        string = string.replace(subtoks.replace('<SEP>', ' '), subtoks)
-    positions = []
-    i = 0
-    for chunk in string.split():
-        subtoks = chunk.split('<SEP>')
-        positions.append(tuple(range(i, i+len(subtoks))))
-        i += len(subtoks)
-    return positions
-
-
-def match_best_version(target_name, target_version, path):
-    path = util.ensure_path(path)
-    if path is None or not path.exists():
-        return None
-    matches = []
-    for data_name in path.iterdir():
-        name, version = split_data_name(data_name.parts[-1])
-        if name == target_name:
-            matches.append((tuple(float(v) for v in version.split('.')), data_name))
-    if matches:
-        return Path(max(matches)[1])
-    else:
-        return None
-
-
-def split_data_name(name):
-    return name.split('-', 1) if '-' in name else (name, '')
+from .cli import download, link
 
 
 def fix_glove_vectors_loading(overrides):
@@ -106,6 +39,24 @@ def fix_glove_vectors_loading(overrides):
     return overrides
 
 
+def match_best_version(target_name, target_version, path):
+    def split_data_name(name):
+        return name.split('-', 1) if '-' in name else (name, '')
+
+    path = util.ensure_path(path)
+    if path is None or not path.exists():
+        return None
+    matches = []
+    for data_name in path.iterdir():
+        name, version = split_data_name(data_name.parts[-1])
+        if name == target_name:
+            matches.append((tuple(float(v) for v in version.split('.')), data_name))
+    if matches:
+        return Path(max(matches)[1])
+    else:
+        return None
+
+
 def resolve_model_name(name):
     """
     If spaCy is loaded with 'de', check if symlink already exists. If