mirror of https://github.com/explosion/spaCy.git
275 lines
8.8 KiB
Python
275 lines
8.8 KiB
Python
# coding: utf8
|
||
from __future__ import unicode_literals, print_function
|
||
|
||
import ujson
|
||
import pip
|
||
import importlib
|
||
import regex as re
|
||
from pathlib import Path
|
||
import sys
|
||
import textwrap
|
||
|
||
from .symbols import ORTH
|
||
from .compat import path2str, basestring_, input_, unicode_
|
||
|
||
|
||
LANGUAGES = {}
|
||
_data_path = Path(__file__).parent / 'data'
|
||
|
||
|
||
def set_lang_class(name, cls):
|
||
global LANGUAGES
|
||
LANGUAGES[name] = cls
|
||
|
||
|
||
def get_lang_class(name):
|
||
if name in LANGUAGES:
|
||
return LANGUAGES[name]
|
||
lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
|
||
if lang not in LANGUAGES:
|
||
raise RuntimeError('Language not supported: %s' % name)
|
||
return LANGUAGES[lang]
|
||
|
||
|
||
def get_data_path(require_exists=True):
|
||
if not require_exists:
|
||
return _data_path
|
||
else:
|
||
return _data_path if _data_path.exists() else None
|
||
|
||
|
||
def set_data_path(path):
|
||
global _data_path
|
||
_data_path = ensure_path(path)
|
||
|
||
|
||
def ensure_path(path):
|
||
if isinstance(path, basestring_):
|
||
return Path(path)
|
||
else:
|
||
return path
|
||
|
||
|
||
def read_regex(path):
|
||
path = ensure_path(path)
|
||
with path.open() as file_:
|
||
entries = file_.read().split('\n')
|
||
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
||
return re.compile(expression)
|
||
|
||
|
||
def compile_prefix_regex(entries):
|
||
if '(' in entries:
|
||
# Handle deprecated data
|
||
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
||
return re.compile(expression)
|
||
else:
|
||
expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
|
||
return re.compile(expression)
|
||
|
||
|
||
def compile_suffix_regex(entries):
|
||
expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
|
||
return re.compile(expression)
|
||
|
||
|
||
def compile_infix_regex(entries):
|
||
expression = '|'.join([piece for piece in entries if piece.strip()])
|
||
return re.compile(expression)
|
||
|
||
|
||
def update_exc(base_exceptions, *addition_dicts):
|
||
exc = dict(base_exceptions)
|
||
for additions in addition_dicts:
|
||
for orth, token_attrs in additions.items():
|
||
if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
|
||
msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
|
||
raise ValueError(msg % (orth, token_attrs))
|
||
described_orth = ''.join(attr[ORTH] for attr in token_attrs)
|
||
if orth != described_orth:
|
||
# TODO: Better error
|
||
msg = "Invalid tokenizer exception: key='%s', orths='%s'"
|
||
raise ValueError(msg % (orth, described_orth))
|
||
# overlap = set(exc.keys()).intersection(set(additions))
|
||
# assert not overlap, overlap
|
||
exc.update(additions)
|
||
expand_exc(exc, "'", "’")
|
||
return exc
|
||
|
||
|
||
def expand_exc(excs, search, replace):
|
||
def _fix_token(token, search, replace):
|
||
fixed = dict(token)
|
||
fixed[ORTH] = fixed[ORTH].replace(search, replace)
|
||
return fixed
|
||
updates = {}
|
||
for token_string, tokens in excs.items():
|
||
if search in token_string:
|
||
new_key = token_string.replace(search, replace)
|
||
new_value = [_fix_token(t, search, replace) for t in tokens]
|
||
updates[new_key] = new_value
|
||
return updates
|
||
|
||
|
||
def normalize_slice(length, start, stop, step=None):
|
||
if not (step is None or step == 1):
|
||
raise ValueError("Stepped slices not supported in Span objects."
|
||
"Try: list(tokens)[start:stop:step] instead.")
|
||
if start is None:
|
||
start = 0
|
||
elif start < 0:
|
||
start += length
|
||
start = min(length, max(0, start))
|
||
|
||
if stop is None:
|
||
stop = length
|
||
elif stop < 0:
|
||
stop += length
|
||
stop = min(length, max(start, stop))
|
||
|
||
assert 0 <= start <= stop <= length
|
||
return start, stop
|
||
|
||
|
||
def check_renamed_kwargs(renamed, kwargs):
|
||
for old, new in renamed.items():
|
||
if old in kwargs:
|
||
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
||
|
||
|
||
def read_json(location):
|
||
with location.open('r', encoding='utf8') as f:
|
||
return ujson.load(f)
|
||
|
||
|
||
def resolve_load_name(name, **overrides):
|
||
if overrides.get('path') not in (None, False, True):
|
||
name = overrides.get('path')
|
||
prints("To load a model from a path, you can now use the first argument. "
|
||
"The model meta is used to load the required Language class.",
|
||
"OLD: spacy.load('en', path='/some/path')", "NEW: spacy.load('/some/path')",
|
||
title="Warning: deprecated argument 'path'")
|
||
return name
|
||
|
||
|
||
def resolve_model_path(name):
|
||
data_path = get_data_path()
|
||
if not data_path or not data_path.exists():
|
||
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
||
if isinstance(name, basestring_):
|
||
if (data_path / name).exists(): # in data dir or shortcut link
|
||
return (data_path / name)
|
||
if is_package(name): # installed as a package
|
||
return get_model_package_path(name)
|
||
if Path(name).exists(): # path to model
|
||
return Path(name)
|
||
elif hasattr(name, 'exists'): # Path or Path-like object
|
||
return name
|
||
raise IOError("Can't find model '%s'" % name)
|
||
|
||
|
||
def is_package(origin):
|
||
"""
|
||
Check if string maps to a package installed via pip.
|
||
"""
|
||
packages = pip.get_installed_distributions()
|
||
for package in packages:
|
||
if package.project_name.replace('-', '_') == origin:
|
||
return True
|
||
return False
|
||
|
||
|
||
def get_model_package_path(package_name):
|
||
# Here we're importing the module just to find it. This is worryingly
|
||
# indirect, but it's otherwise very difficult to find the package.
|
||
# Python's installation and import rules are very complicated.
|
||
pkg = importlib.import_module(package_name)
|
||
package_path = Path(pkg.__file__).parent.parent
|
||
meta = parse_package_meta(package_path / package_name)
|
||
model_name = '%s-%s' % (package_name, meta['version'])
|
||
return package_path / package_name / model_name
|
||
|
||
|
||
def parse_package_meta(package_path, require=True):
|
||
"""
|
||
Check if a meta.json exists in a package and return its contents as a
|
||
dictionary. If require is set to True, raise an error if no meta.json found.
|
||
"""
|
||
location = package_path / 'meta.json'
|
||
if location.is_file():
|
||
return read_json(location)
|
||
elif require:
|
||
raise IOError("Could not read meta.json from %s" % location)
|
||
else:
|
||
return None
|
||
|
||
|
||
def get_raw_input(description, default=False):
|
||
"""
|
||
Get user input via raw_input / input and return input value. Takes a
|
||
description, and an optional default value to display with the prompt.
|
||
"""
|
||
additional = ' (default: %s)' % default if default else ''
|
||
prompt = ' %s%s: ' % (description, additional)
|
||
user_input = input_(prompt)
|
||
return user_input
|
||
|
||
|
||
def print_table(data, title=None):
|
||
"""
|
||
Print data in table format. Can either take a list of tuples or a
|
||
dictionary, which will be converted to a list of tuples.
|
||
"""
|
||
if type(data) == dict:
|
||
data = list(data.items())
|
||
tpl_row = ' {:<15}' * len(data[0])
|
||
table = '\n'.join([tpl_row.format(l, v) for l, v in data])
|
||
if title:
|
||
print('\n \033[93m{}\033[0m'.format(title))
|
||
print('\n{}\n'.format(table))
|
||
|
||
|
||
def print_markdown(data, title=None):
|
||
"""
|
||
Print listed data in GitHub-flavoured Markdown format so it can be
|
||
copy-pasted into issues. Can either take a list of tuples or a dictionary.
|
||
"""
|
||
def excl_value(value):
|
||
return Path(value).exists() # contains path (personal info)
|
||
|
||
if type(data) == dict:
|
||
data = list(data.items())
|
||
markdown = ["* **{}:** {}".format(l, v) for l, v in data if not excl_value(v)]
|
||
if title:
|
||
print("\n## {}".format(title))
|
||
print('\n{}\n'.format('\n'.join(markdown)))
|
||
|
||
|
||
def prints(*texts, **kwargs):
|
||
"""
|
||
Print formatted message. Each positional argument is rendered as newline-
|
||
separated paragraph. An optional highlighted title is printed above the text
|
||
(using ANSI escape sequences manually to avoid unnecessary dependency).
|
||
"""
|
||
exits = kwargs.get('exits', False)
|
||
title = kwargs.get('title', None)
|
||
title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''
|
||
message = '\n\n'.join([_wrap(text) for text in texts])
|
||
print('\n{}{}\n'.format(title, message))
|
||
if exits:
|
||
sys.exit(0)
|
||
|
||
|
||
def _wrap(text, wrap_max=80, indent=4):
|
||
"""
|
||
Wrap text at given width using textwrap module. Indent should consist of
|
||
spaces. Its length is deducted from wrap width to ensure exact wrapping.
|
||
"""
|
||
indent = indent * ' '
|
||
wrap_width = wrap_max - len(indent)
|
||
if isinstance(text, Path):
|
||
text = path2str(text)
|
||
return textwrap.fill(text, width=wrap_width, initial_indent=indent,
|
||
subsequent_indent=indent, break_long_words=False,
|
||
break_on_hyphens=False)
|