spaCy/spacy/util.py

# coding: utf8
from __future__ import unicode_literals, print_function

import ujson
import pip
import importlib
import regex as re
from pathlib import Path
import sys
import textwrap

from .compat import basestring_, unicode_, input_


LANGUAGES = {}
_data_path = Path(__file__).parent / 'data'


def set_lang_class(name, cls):
    """
    Register a language class in the module-level LANGUAGES registry.

    name: key to store the class under.
    cls: the class to store; replaces any entry already stored for name.
    """
    # The registry is mutated in place, so no rebinding of the global occurs.
    LANGUAGES.update({name: cls})


def get_lang_class(name):
    """
    Return the language class registered for a name.

    The full name is checked first, so an exact registration always wins.
    Otherwise the part of the name before its first non-alphanumeric
    character is looked up. Raises RuntimeError if neither is registered.
    """
    key = name
    if key not in LANGUAGES:
        # Fall back to the leading alphanumeric run of the name.
        key = re.split('[^a-zA-Z0-9]', name, 1)[0]
        if key not in LANGUAGES:
            raise RuntimeError('Language not supported: %s' % name)
    return LANGUAGES[key]


def get_data_path(require_exists=True):
    """
    Return the module-level data path.

    If require_exists is true and the path does not exist, None is returned
    instead. If require_exists is false, the path is returned unchecked.
    """
    if require_exists and not _data_path.exists():
        return None
    return _data_path


def set_data_path(path):
    """
    Replace the module-level data path. The given path is passed through
    ensure_path() before it is stored.
    """
    global _data_path
    new_data_path = ensure_path(path)
    _data_path = new_data_path


def ensure_path(path):
    """
    Convert a string to a Path object. Any value that is not a string is
    returned unchanged.
    """
    return Path(path) if isinstance(path, basestring_) else path


def read_regex(path):
    """
    Build a compiled regex from a file containing one literal entry per line.

    path: string or Path of the file to read.
    RETURNS: compiled pattern that matches any of the entries, each escaped
        and anchored with '^'. Lines that are empty or whitespace-only are
        skipped.
    """
    path = ensure_path(path)
    # Decode explicitly as utf8 (as read_json does) instead of relying on the
    # platform's default encoding.
    with path.open('r', encoding='utf8') as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    return re.compile(expression)


def compile_prefix_regex(entries):
    """
    Compile prefix entries into one regex, each alternative anchored with '^'.

    If a bare '(' is among the entries, the data is treated as deprecated
    literal strings and every entry is escaped. Otherwise the entries are
    used as regex fragments unchanged. Blank entries are skipped.
    """
    # Handle deprecated data
    is_deprecated = '(' in entries
    anchored = []
    for piece in entries:
        if not piece.strip():
            continue
        fragment = re.escape(piece) if is_deprecated else piece
        anchored.append('^' + fragment)
    return re.compile('|'.join(anchored))


def compile_suffix_regex(entries):
    """
    Compile suffix entries into one regex. Each non-blank entry becomes an
    alternative with '$' appended.
    """
    alternatives = []
    for piece in entries:
        if piece.strip():
            alternatives.append(piece + '$')
    return re.compile('|'.join(alternatives))


def compile_infix_regex(entries):
    """
    Compile infix entries into one regex. The non-blank entries are joined
    as alternatives without any anchoring.
    """
    alternatives = []
    for piece in entries:
        if piece.strip():
            alternatives.append(piece)
    return re.compile('|'.join(alternatives))


def normalize_slice(length, start, stop, step=None):
    """
    Clamp slice bounds to a sequence of the given length.

    length: number of items in the sequence being sliced.
    start: start index; None means 0, negative values count from the end.
    stop: stop index; None means length, negative values count from the end.
    step: only None or 1 is accepted.
    RETURNS: tuple (start, stop) with 0 <= start <= stop <= length.
    RAISES: ValueError if step is neither None nor 1.
    """
    if not (step is None or step == 1):
        # The two literals are concatenated, so the first needs a trailing
        # space to keep the sentences apart.
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))

    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # stop is clamped against start so the result is never an inverted range.
    stop = min(length, max(start, stop))

    assert 0 <= start <= stop <= length
    return start, stop


def check_renamed_kwargs(renamed, kwargs):
    """
    Raise a TypeError if kwargs still uses a renamed keyword argument.

    renamed: dict mapping old argument names to their new names.
    kwargs: the keyword arguments to check.
    """
    for old_name, new_name in renamed.items():
        if old_name not in kwargs:
            continue
        message = "Keyword argument %s now renamed to %s" % (old_name, new_name)
        raise TypeError(message)


def read_json(location):
    """
    Open the given Path as utf8 text and return its contents parsed by ujson.
    """
    with location.open('r', encoding='utf8') as file_:
        contents = ujson.load(file_)
    return contents


def is_package(origin):
    """
    Check if string maps to a package installed via pip. Project names are
    compared with their hyphens replaced by underscores.
    """
    # NOTE(review): this relies on pip's Python API being importable and
    # exposing get_installed_distributions() - confirm for the pip versions
    # that need to be supported.
    return any(package.project_name.replace('-', '_') == origin
               for package in pip.get_installed_distributions())


def get_model_package_path(package_name):
    """
    Return the path of the model data directory inside an installed package.

    The result is <parent of the package directory> / <package_name> /
    <package_name>-<version>, with the version read from the package's
    meta.json.
    """
    # Here we're importing the module just to find it. This is worryingly
    # indirect, but it's otherwise very difficult to find the package.
    # Python's installation and import rules are very complicated.
    module = importlib.import_module(package_name)
    install_dir = Path(module.__file__).parent.parent
    version = parse_package_meta(install_dir, package_name)['version']
    return install_dir / package_name / ('%s-%s' % (package_name, version))


def parse_package_meta(package_path, package, require=True):
    """
    Return the parsed contents of package_path / package / meta.json.

    If that file does not exist, an IOError is raised when require is true,
    and None is returned otherwise.
    """
    # TODO: Allow passing in full model path and only require one argument
    # instead of path and package name. This lets us avoid passing in an awkward
    # empty string in spacy.load() if user supplies full model path.
    meta_path = package_path / package / 'meta.json'
    if not meta_path.is_file():
        if require:
            raise IOError("Could not read meta.json from %s" % meta_path)
        return None
    return read_json(meta_path)


def get_raw_input(description, default=False):
    """
    Prompt the user via raw_input / input and return what was entered.

    description: text shown in the prompt.
    default: if truthy, shown in the prompt as "(default: ...)". It is only
        displayed; it is not substituted for an empty answer.
    """
    suffix = ''
    if default:
        suffix = ' (default: {d})'.format(d=default)
    return input_('    {d}{a}: '.format(d=description, a=suffix))


def print_table(data, **kwargs):
    """
    Print data in table format. Can either take a list of tuples or a
    dictionary (including dict subclasses), which will be converted to a
    list of tuples.

    data: rows to print; each cell is left-aligned in a 15-character column.
        Empty data prints an empty table instead of raising.
    title (keyword): if given and truthy, printed above the table,
        highlighted with ANSI escape sequences.
    """
    if isinstance(data, dict):
        data = data.items()

    tpl_cell = '    {:<15}'
    # The template is sized per row, so rows need not have exactly two cells
    # and there is no lookup of a first row that may not exist.
    rows = [(tpl_cell * len(row)).format(*row) for row in data]

    title = kwargs.get('title')
    if title:
        print('\n    \033[93m{msg}\033[0m'.format(msg=title))

    print('\n{msg}\n'.format(msg='\n'.join(rows)))


def print_markdown(data, **kwargs):
    """
    Print listed data in GitHub-flavoured Markdown format so it can be
    copy-pasted into issues. Can either take a list of tuples or a dictionary
    (including dict subclasses), which will be converted to a list of tuples.

    data: (label, value) pairs; each is printed as one bullet point.
    title (keyword): if given and truthy, printed above the list as a
        second-level heading.
    """
    # Computed once here rather than once per row.
    module_dir = unicode_(Path(__file__).parent)

    def excl_value(value):
        # don't print value if it contains absolute path of directory (i.e.
        # personal info). Other conditions can be included here if necessary.
        # The value is converted to text first so that non-string values
        # (numbers, Path objects) can be checked without raising.
        return module_dir in unicode_(value)

    if isinstance(data, dict):
        data = data.items()

    tpl_row = "* **{l}:** {v}"
    markdown = '\n'.join([tpl_row.format(l=l, v=v) for l, v in data if not excl_value(v)])

    title = kwargs.get('title')
    if title:
        print("\n## {msg}".format(msg=title))
    print("\n{msg}\n".format(msg=markdown))


def print_msg(*text, **kwargs):
    """
    Print a formatted message. Every positional argument is wrapped and
    printed as its own paragraph, separated by a blank line. If a truthy
    'title' kwarg is given, it is wrapped and printed above the text,
    highlighted using ANSI escape sequences (written out manually to avoid
    an unnecessary dependency).
    """
    paragraphs = [_wrap_text(paragraph) for paragraph in text]

    heading = kwargs.get('title')
    if heading:
        print('\n\033[93m{msg}\033[0m'.format(msg=_wrap_text(heading)))
    print('\n{msg}\n'.format(msg='\n\n'.join(paragraphs)))


def _wrap_text(text):
    """
    Wrap text at given width using textwrap module. Indent should consist of
    spaces. Its length is deducted from wrap width to ensure exact wrapping.
    """
    wrap_max = 80
    indent = '    '
    wrap_width = wrap_max - len(indent)
    return textwrap.fill(text, width=wrap_width, initial_indent=indent,
                               subsequent_indent=indent, break_long_words=False,
                               break_on_hyphens=False)


def sys_exit(*messages, **kwargs):
    """
    Exit the process with status 0, optionally printing a message first.
    Intended for modules used from the command line, like download and link.
    Arguments are the same as for print_msg() and are forwarded to it when
    at least one message is given.
    """
    exit_status = 0
    if len(messages) > 0:
        print_msg(*messages, **kwargs)
    sys.exit(exit_status)
Use consistent unicode declarations 2017-03-12 12:07:28 +00:00			`# coding: utf8`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00			`from __future__ import unicode_literals, print_function`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 10:05:47 +00:00
Fix json imports and use ujson 2017-04-15 10:13:34 +00:00			`import ujson`
Move is_package and get_model_package_path to util 2017-05-07 21:24:51 +00:00			`import pip`
			`import importlib`
Use `regex` instead of `re` 2017-04-19 23:22:52 +00:00			`import regex as re`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 10:05:47 +00:00			`from pathlib import Path`
Move sys_exit() function to util 2017-03-16 16:08:58 +00:00			`import sys`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00			`import textwrap`

Add compat functions and remove old workarounds Add ensure_path util function to handle checking instance of path 2017-04-15 10:11:16 +00:00			`from .compat import basestring_, unicode_, input_`
Handle raw_input vs input in Python 2 and 3 2017-03-20 21:48:32 +00:00

add lang registration facility 2016-03-25 17:54:45 +00:00			`LANGUAGES = {}`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 10:05:47 +00:00			`_data_path = Path(__file__).parent / 'data'`
add lang registration facility 2016-03-25 17:54:45 +00:00

relative imports in __init__.py 2016-03-26 10:44:53 +00:00			`def set_lang_class(name, cls):`
add lang registration facility 2016-03-25 17:54:45 +00:00			`global LANGUAGES`
			`LANGUAGES[name] = cls`


relative imports in __init__.py 2016-03-26 10:44:53 +00:00			`def get_lang_class(name):`
Check if full string is found in lang classes first This allows users to set arbitrary strings. (Otherwise, custom lang class "my_custom_class" would always load Burmese "my" tokenizer if one was available.) 2017-04-16 20:14:38 +00:00			`if name in LANGUAGES:`
			`return LANGUAGES[name]`
Fix get_lang_class parsing (take 2) 2016-05-16 23:40:31 +00:00			`lang = re.split('[^a-zA-Z0-9]', name, 1)[0]`
add lang registration facility 2016-03-25 17:54:45 +00:00			`if lang not in LANGUAGES:`
Check if full string is found in lang classes first This allows users to set arbitrary strings. (Otherwise, custom lang class "my_custom_class" would always load Burmese "my" tokenizer if one was available.) 2017-04-16 20:14:38 +00:00			`raise RuntimeError('Language not supported: %s' % name)`
add lang registration facility 2016-03-25 17:54:45 +00:00			`return LANGUAGES[lang]`


Unbreak data download 2017-01-09 22:40:26 +00:00			`def get_data_path(require_exists=True):`
			`if not require_exists:`
			`return _data_path`
			`else:`
			`return _data_path if _data_path.exists() else None`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00

			`def set_data_path(path):`
			`global _data_path`
Add compat functions and remove old workarounds Add ensure_path util function to handle checking instance of path 2017-04-15 10:11:16 +00:00			`_data_path = ensure_path(path)`


			`def ensure_path(path):`
			`if isinstance(path, basestring_):`
			`return Path(path)`
			`else:`
			`return path`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00

			`def read_regex(path):`
Add compat functions and remove old workarounds Add ensure_path util function to handle checking instance of path 2017-04-15 10:11:16 +00:00			`path = ensure_path(path)`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00			`with path.open() as file_:`
			`entries = file_.read().split('\n')`
			`expression = '\|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])`
			`return re.compile(expression)`


Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 12:49:53 +00:00			`def compile_prefix_regex(entries):`
Handle deprecated tokenizer prefix data 2017-01-08 19:33:28 +00:00			`if '(' in entries:`
			`# Handle deprecated data`
			`expression = '\|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])`
			`return re.compile(expression)`
			`else:`
			`expression = '\|'.join(['^' + piece for piece in entries if piece.strip()])`
			`return re.compile(expression)`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00

Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 12:49:53 +00:00			`def compile_suffix_regex(entries):`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00			`expression = '\|'.join([piece + '$' for piece in entries if piece.strip()])`
			`return re.compile(expression)`


Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 12:49:53 +00:00			`def compile_infix_regex(entries):`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00			`expression = '\|'.join([piece for piece in entries if piece.strip()])`
			`return re.compile(expression)`


Refactor to remove duplicate slicing logic 2015-10-07 08:25:35 +00:00			`def normalize_slice(length, start, stop, step=None):`
			`if not (step is None or step == 1):`
			`raise ValueError("Stepped slices not supported in Span objects."`
			`"Try: list(tokens)[start:stop:step] instead.")`
			`if start is None:`
			`start = 0`
			`elif start < 0:`
			`start += length`
			`start = min(length, max(0, start))`

			`if stop is None:`
			`stop = length`
			`elif stop < 0:`
			`stop += length`
			`stop = min(length, max(start, stop))`

			`assert 0 <= start <= stop <= length`
			`return start, stop`


Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 12:49:53 +00:00			`def check_renamed_kwargs(renamed, kwargs):`
			`for old, new in renamed.items():`
			`if old in kwargs:`
			`raise TypeError("Keyword argument %s now renamed to %s" % (old, new))`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00

Move read_json out to own util function 2017-04-16 11:03:28 +00:00			`def read_json(location):`
			`with location.open('r', encoding='utf8') as f:`
			`return ujson.load(f)`


Move is_package and get_model_package_path to util 2017-05-07 21:24:51 +00:00			`def is_package(origin):`
			`"""`
			`Check if string maps to a package installed via pip.`
			`"""`
			`packages = pip.get_installed_distributions()`
			`for package in packages:`
			`if package.project_name.replace('-', '_') == origin:`
			`return True`
			`return False`



			`def get_model_package_path(package_name):`
			`# Here we're importing the module just to find it. This is worryingly`
			`# indirect, but it's otherwise very difficult to find the package.`
			`# Python's installation and import rules are very complicated.`
			`pkg = importlib.import_module(package_name)`
			`package_path = Path(pkg.__file__).parent.parent`
			`meta = parse_package_meta(package_path, package_name)`
			`model_name = '%s-%s' % (package_name, meta['version'])`
			`return package_path / package_name / model_name`


Fix loading when no package found 2017-03-16 23:30:02 +00:00			`def parse_package_meta(package_path, package, require=True):`
Add docstring and todo note 2017-04-16 20:14:45 +00:00			`"""`
			`Check if a meta.json exists in a package and return its contents as a`
			`dictionary. If require is set to True, raise an error if no meta.json found.`
			`"""`
			`# TODO: Allow passing in full model path and only require one argument`
			`# instead of path and package name. This lets us avoid passing in an awkward`
			`# empty string in spacy.load() if user supplies full model path.`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 10:05:47 +00:00			`location = package_path / package / 'meta.json'`
			`if location.is_file():`
Move read_json out to own util function 2017-04-16 11:03:28 +00:00			`return read_json(location)`
Fix loading when no package found 2017-03-16 23:30:02 +00:00			`elif require:`
			`raise IOError("Could not read meta.json from %s" % location)`
			`else:`
			`return None`
Add util function to load and parse package meta.json 2017-03-16 16:10:05 +00:00

Add util function to get raw user input 2017-03-20 21:48:56 +00:00			`def get_raw_input(description, default=False):`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
			`Get user input via raw_input / input and return input value. Takes a`
Add util function to get raw user input 2017-03-20 21:48:56 +00:00			`description for the prompt, and an optional default value that's displayed`
Fix formatting 2017-04-16 11:42:34 +00:00			`with the prompt.`
			`"""`
Add util function to get raw user input 2017-03-20 21:48:56 +00:00			`additional = ' (default: {d})'.format(d=default) if default else ''`
			`prompt = ' {d}{a}: '.format(d=description, a=additional)`
Add compat functions and remove old workarounds Add ensure_path util function to handle checking instance of path 2017-04-15 10:11:16 +00:00			`user_input = input_(prompt)`
Add util function to get raw user input 2017-03-20 21:48:56 +00:00			`return user_input`


Add util functions to print data as table or markdown list 2017-03-18 12:00:14 +00:00			`def print_table(data, **kwargs):`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
			`Print data in table format. Can either take a list of tuples or a`
			`dictionary, which will be converted to a list of tuples.`
			`"""`
Add util functions to print data as table or markdown list 2017-03-18 12:00:14 +00:00			`if type(data) == dict:`
			`data = list(data.items())`

			`tpl_msg = '\n{msg}\n'`
			`tpl_title = '\n \033[93m{msg}\033[0m'`
			`tpl_row =" {:<15}" * len(data[0])`
			`table = '\n'.join([tpl_row.format(l, v) for l, v in data])`

			`if 'title' in kwargs and kwargs['title']:`
			`print(tpl_title.format(msg=kwargs['title']))`

			`print(tpl_msg.format(msg=table))`


			`def print_markdown(data, **kwargs):`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
			`Print listed data in GitHub-flavoured Markdown format so it can be`
Add util functions to print data as table or markdown list 2017-03-18 12:00:14 +00:00			`copy-pasted into issues. Can either take a list of tuples or a dictionary,`
Fix formatting 2017-04-16 11:42:34 +00:00			`which will be converted to a list of tuples.`
			`"""`
Add util functions to print data as table or markdown list 2017-03-18 12:00:14 +00:00			`def excl_value(value):`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 10:05:47 +00:00			`# don't print value if it contains absolute path of directory (i.e.`
			`# personal info). Other conditions can be included here if necessary.`
			`if unicode_(Path(__file__).parent) in value:`
Add util functions to print data as table or markdown list 2017-03-18 12:00:14 +00:00			`return True`

			`if type(data) == dict:`
			`data = list(data.items())`

			`tpl_msg = "\n{msg}\n"`
			`tpl_title = "\n## {msg}"`
			`tpl_row = "* {l}: {v}"`
			`markdown = '\n'.join([tpl_row.format(l=l, v=v) for l, v in data if not excl_value(v)])`

			`if 'title' in kwargs and kwargs['title']:`
			`print(tpl_title.format(msg=kwargs['title']))`
			`print(tpl_msg.format(msg=markdown))`


Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00			`def print_msg(text, *kwargs):`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
			`Print formatted message. Each positional argument is rendered as newline-`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00			`separated paragraph. If kwarg 'title' exist, title is printed above the text`
			`and highlighted (using ANSI escape sequences manually to avoid unnecessary`
Fix formatting 2017-04-16 11:42:34 +00:00			`dependency).`
			`"""`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00			`message = '\n\n'.join([_wrap_text(t) for t in text])`
			`tpl_msg = '\n{msg}\n'`
			`tpl_title = '\n\033[93m{msg}\033[0m'`

			`if 'title' in kwargs and kwargs['title']:`
			`title = _wrap_text(kwargs['title'])`
			`print(tpl_title.format(msg=title))`
			`print(tpl_msg.format(msg=message))`


			`def _wrap_text(text):`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
			`Wrap text at given width using textwrap module. Indent should consist of`
			`spaces. Its length is deducted from wrap width to ensure exact wrapping.`
			`"""`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00			`wrap_max = 80`
			`indent = ' '`
			`wrap_width = wrap_max - len(indent)`
			`return textwrap.fill(text, width=wrap_width, initial_indent=indent,`
Don't break text in when rendering print_msg 2017-03-16 16:09:50 +00:00			`subsequent_indent=indent, break_long_words=False,`
			`break_on_hyphens=False)`
Move sys_exit() function to util 2017-03-16 16:08:58 +00:00

			`def sys_exit(messages, *kwargs):`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
			`Performs SystemExit. For modules used from the command line, like`
Move sys_exit() function to util 2017-03-16 16:08:58 +00:00			`download and link. To print message, use the same arguments as for`
Fix formatting 2017-04-16 11:42:34 +00:00			`print_msg().`
			`"""`
Move sys_exit() function to util 2017-03-16 16:08:58 +00:00			`if messages:`
			`print_msg(messages, *kwargs)`
			`sys.exit(0)`