spaCy/spacy/util.py

219 lines
6.6 KiB
Python
Raw Normal View History

2017-03-12 12:07:28 +00:00
# coding: utf8
from __future__ import unicode_literals, print_function
2017-04-15 10:13:34 +00:00
import ujson
import pip
import importlib
2017-04-19 23:22:52 +00:00
import regex as re
from pathlib import Path
2017-03-16 16:08:58 +00:00
import sys
import textwrap
2017-05-07 21:25:29 +00:00
from .compat import path2str, basestring_, input_
2016-03-25 17:54:45 +00:00
# Registry mapping a language name to its language class. Entries are added
# by set_lang_class() and looked up by get_lang_class().
LANGUAGES = {}
# Data directory used by get_data_path(): by default a 'data' folder located
# next to this module. Can be replaced at runtime via set_data_path().
_data_path = Path(__file__).parent / 'data'
2016-03-25 17:54:45 +00:00
2016-03-26 10:44:53 +00:00
def set_lang_class(name, cls):
    """
    Register a language class in the module-level LANGUAGES registry under
    the given name. An existing entry for the same name is overwritten.
    """
    # Item assignment mutates the shared dict in place, so no rebinding of
    # the module-level name takes place here.
    LANGUAGES[name] = cls
2016-03-26 10:44:53 +00:00
def get_lang_class(name):
    """
    Return the language class registered for a name. An exact match in
    LANGUAGES wins; otherwise the name is cut at its first non-alphanumeric
    character and the leading part is looked up instead. Raises a
    RuntimeError if neither lookup succeeds.
    """
    try:
        return LANGUAGES[name]
    except KeyError:
        pass
    # Fall back to the part of the name before the first separator.
    base_lang = re.split('[^a-zA-Z0-9]', name, maxsplit=1)[0]
    if base_lang in LANGUAGES:
        return LANGUAGES[base_lang]
    raise RuntimeError('Language not supported: %s' % name)
2017-01-09 22:40:26 +00:00
def get_data_path(require_exists=True):
    """
    Return the current data path. If require_exists is True and the path
    does not exist on disk, return None instead.
    """
    if require_exists and not _data_path.exists():
        return None
    return _data_path
2016-09-24 18:26:17 +00:00
def set_data_path(path):
    """
    Replace the module-level data path. The given value is passed through
    ensure_path() before it is stored.
    """
    global _data_path
    new_path = ensure_path(path)
    _data_path = new_path
def ensure_path(path):
    """
    Return a Path built from the input if it is a string (basestring_);
    any other value is returned unchanged.
    """
    return Path(path) if isinstance(path, basestring_) else path
2016-09-24 18:26:17 +00:00
def read_regex(path):
    """
    Read a file containing one literal entry per line and compile the
    entries into a single regular expression. Each non-blank entry is
    escaped, anchored with '^', and joined to the others with '|'.

    path: a string or Path pointing to the entries file.
    Returns the compiled pattern.
    """
    path = ensure_path(path)
    # Decode explicitly as UTF-8 (as read_json does) rather than relying on
    # the platform's default encoding.
    with path.open('r', encoding='utf8') as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    return re.compile(expression)
def compile_prefix_regex(entries):
    """
    Compile prefix entries into one pattern in which every non-blank entry
    is anchored with '^' and the alternatives are joined with '|'. If '('
    is contained in entries, the data is treated as deprecated and each
    entry is escaped before being anchored.
    """
    # Handle deprecated data
    legacy = '(' in entries
    anchored = []
    for piece in entries:
        if not piece.strip():
            continue
        anchored.append('^' + (re.escape(piece) if legacy else piece))
    return re.compile('|'.join(anchored))
2016-09-24 18:26:17 +00:00
def compile_suffix_regex(entries):
    """
    Compile suffix entries into one pattern: every non-blank entry gets a
    trailing '$' anchor and the alternatives are joined with '|'.
    """
    anchored = []
    for piece in entries:
        if piece.strip():
            anchored.append(piece + '$')
    return re.compile('|'.join(anchored))
def compile_infix_regex(entries):
    """
    Compile infix entries into one pattern by joining all non-blank entries
    with '|'. Entries are used as given, without escaping or anchoring.
    """
    pieces = []
    for piece in entries:
        if piece.strip():
            pieces.append(piece)
    return re.compile('|'.join(pieces))
def normalize_slice(length, start, stop, step=None):
    """
    Convert slice bounds into concrete, clamped (start, stop) indices for a
    sequence of the given length.

    length: length of the sequence being sliced.
    start: start index; None means 0, negative values count from the end.
    stop: stop index; None means length, negative values count from the end.
    step: only None or 1 is accepted.

    Returns a (start, stop) tuple with 0 <= start <= stop <= length.
    Raises ValueError if a step other than None or 1 is given.
    """
    if not (step is None or step == 1):
        # The trailing space keeps the two sentences apart once the
        # adjacent string literals are concatenated.
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    # Clamp into [0, length], also covering negative values that are still
    # below zero after the offset.
    start = min(length, max(0, start))
    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # A stop before start collapses to an empty range at start.
    stop = min(length, max(start, stop))
    assert 0 <= start <= stop <= length
    return start, stop
def check_renamed_kwargs(renamed, kwargs):
    """
    Raise a TypeError if kwargs contains a keyword listed as renamed.
    renamed maps old keyword names to their new names; the error message
    names both.
    """
    for old_name, new_name in renamed.items():
        if old_name not in kwargs:
            continue
        raise TypeError("Keyword argument %s now renamed to %s" % (old_name, new_name))
def read_json(location):
    """
    Open the given location (an object with a Path-style open() method) as
    UTF-8 text and return its contents parsed with ujson.
    """
    with location.open('r', encoding='utf8') as file_:
        contents = ujson.load(file_)
    return contents
def is_package(origin):
    """
    Check if string maps to a package installed via pip. Hyphens in the
    project names reported by pip are replaced with underscores before the
    comparison.
    """
    # NOTE(review): pip.get_installed_distributions() appears to be gone from
    # pip's top-level namespace in newer pip releases -- confirm which pip
    # versions need to be supported.
    return any(package.project_name.replace('-', '_') == origin
               for package in pip.get_installed_distributions())
def get_model_package_path(package_name):
    """
    Return the path to the model data directory inside an installed model
    package: <parent of package dir>/<package_name>/<package_name>-<version>,
    where the version is read from the package's meta.json.
    """
    # The module is imported only to locate it on disk. This is worryingly
    # indirect, but finding the package any other way is very difficult
    # given how complicated Python's installation and import rules are.
    module = importlib.import_module(package_name)
    install_dir = Path(module.__file__).parent.parent
    meta = parse_package_meta(install_dir, package_name)
    versioned_name = '%s-%s' % (package_name, meta['version'])
    return install_dir / package_name / versioned_name
2017-03-16 23:30:02 +00:00
def parse_package_meta(package_path, package, require=True):
    """
    Load <package_path>/<package>/meta.json and return its contents. If the
    file is missing, raise an IOError when require is True, otherwise
    return None.
    """
    # TODO: Allow passing in full model path and only require one argument
    # instead of path and package name. This lets us avoid passing in an awkward
    # empty string in spacy.load() if user supplies full model path.
    location = package_path / package / 'meta.json'
    if location.is_file():
        return read_json(location)
    if require:
        raise IOError("Could not read meta.json from %s" % location)
    return None
def get_raw_input(description, default=False):
    """
    Prompt the user via input_ and return whatever was entered. A truthy
    default is only shown in the prompt text; it is not substituted for
    empty input.
    """
    if default:
        additional = ' (default: %s)' % default
    else:
        additional = ''
    return input_(' %s%s: ' % (description, additional))
2017-05-07 21:25:29 +00:00
def print_table(data, title=None):
    """
    Print data in table format. Can either take a list of tuples or a
    dictionary (including dict subclasses), which will be converted to a
    list of tuples. Every value is left-aligned in a 15-character column.

    data: dict or list of equally sized tuples; may be empty.
    title: optional title, printed highlighted above the table.
    """
    # isinstance() also accepts dict subclasses such as OrderedDict, which
    # an exact type comparison would treat as a list of rows.
    if isinstance(data, dict):
        data = list(data.items())
    if data:
        # One column per item of the first row; rows are expanded with *
        # so the template and the row width always agree.
        tpl_row = ' {:<15}' * len(data[0])
        table = '\n'.join([tpl_row.format(*row) for row in data])
    else:
        # Nothing to lay out: print an empty table instead of failing on
        # data[0].
        table = ''
    if title:
        print('\n \033[93m{}\033[0m'.format(title))
    print('\n{}\n'.format(table))
2017-05-07 21:25:29 +00:00
def print_markdown(data, title=None):
    """
    Print listed data in GitHub-flavoured Markdown format so it can be
    copy-pasted into issues. Can either take a list of tuples or a dictionary
    (including dict subclasses). Entries whose value is an existing
    filesystem path are left out.

    data: dict or list of (label, value) tuples.
    title: optional title, printed as a second-level Markdown heading.
    """
    def excl_value(value):
        # contains path (personal info)
        try:
            return Path(value).exists()
        except (TypeError, ValueError, OSError):
            # Values that cannot be interpreted as a path (numbers, None,
            # ...) or cannot be checked are not paths, so they are kept.
            return False

    # isinstance() also accepts dict subclasses such as OrderedDict.
    if isinstance(data, dict):
        data = list(data.items())
    markdown = ["* **{}:** {}".format(l, v) for l, v in data if not excl_value(v)]
    if title:
        print("\n## {}".format(title))
    print('\n{}\n'.format('\n'.join(markdown)))
2017-05-07 23:05:24 +00:00
def prints(*texts, **kwargs):
    """
    Print formatted message. Each positional argument is rendered as newline-
    separated paragraph. An optional highlighted title is printed above the text
    (using ANSI escape sequences manually to avoid unnecessary dependency).

    *texts: paragraphs to print; each one is wrapped with _wrap().
    **kwargs: 'title' (default None) is the highlighted heading; 'exits'
        (default False), if truthy, ends the process with sys.exit(0) after
        printing.
    """
    # Options are read from **kwargs rather than declared as named
    # parameters after *texts -- presumably to keep the signature valid on
    # Python 2 as well; confirm before changing.
    exits = kwargs.get('exits', False)
    title = kwargs.get('title', None)
    title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''
    message = '\n\n'.join([_wrap(text) for text in texts])
    print('\n{}{}\n'.format(title, message))
    if exits:
        sys.exit(0)
2017-05-07 21:25:29 +00:00
def _wrap(text, wrap_max=80, indent=4):
2017-04-16 11:42:34 +00:00
"""
Wrap text at given width using textwrap module. Indent should consist of
spaces. Its length is deducted from wrap width to ensure exact wrapping.
"""
2017-05-07 21:25:29 +00:00
indent = indent * ' '
wrap_width = wrap_max - len(indent)
2017-05-07 21:25:29 +00:00
if isinstance(text, Path):
text = path2str(text)
return textwrap.fill(text, width=wrap_width, initial_indent=indent,
2017-05-07 21:25:29 +00:00
subsequent_indent=indent, break_long_words=False,
break_on_hyphens=False)