spaCy/spacy/util.py

# coding: utf8
from __future__ import unicode_literals, print_function

import ujson
import pip
import importlib
import regex as re
from pathlib import Path
import sys
import textwrap

from .compat import path2str, basestring_, input_


# Registry of language classes keyed by name. Entries are added through
# set_lang_class() and looked up through get_lang_class().
LANGUAGES = {}
# Location of the data directory. Defaults to the 'data' folder that sits next
# to this module; can be replaced at runtime via set_data_path().
_data_path = Path(__file__).parent / 'data'


def set_lang_class(name, cls):
    """
    Register a language class in the module-level LANGUAGES registry under
    the given name, replacing any class previously stored for that name.
    """
    # Item assignment mutates the existing dict in place, so no rebinding of
    # the module-level name takes place here.
    LANGUAGES[name] = cls


def get_lang_class(name):
    """
    Return the language class registered for a name. The full string is tried
    first; if it is not registered, the leading alphanumeric part of the name
    (everything before the first non-alphanumeric character) is tried instead.
    Raises RuntimeError if neither is registered.
    """
    if name not in LANGUAGES:
        # Fall back to the prefix before the first separator character.
        prefix = re.split('[^a-zA-Z0-9]', name, 1)[0]
        if prefix in LANGUAGES:
            return LANGUAGES[prefix]
        raise RuntimeError('Language not supported: %s' % name)
    return LANGUAGES[name]


def get_data_path(require_exists=True):
    """
    Return the currently configured data path. If require_exists is true and
    the path is not present on disk, None is returned instead.
    """
    if require_exists and not _data_path.exists():
        return None
    return _data_path


def set_data_path(path):
    """
    Replace the module-level data path. The argument is passed through
    ensure_path() before being stored.
    """
    global _data_path
    new_location = ensure_path(path)
    _data_path = new_location


def ensure_path(path):
    """
    Convert a string to a Path object. Any value that is not a string is
    returned unchanged.
    """
    return Path(path) if isinstance(path, basestring_) else path


def read_regex(path):
    """
    Read newline-separated entries from a file and compile them into a single
    regular expression. Every non-blank entry is escaped and anchored to the
    start of the string; the entries are joined as alternatives.
    """
    with ensure_path(path).open() as handle:
        lines = handle.read().split('\n')
    anchored = ['^' + re.escape(line) for line in lines if line.strip()]
    return re.compile('|'.join(anchored))


def compile_prefix_regex(entries):
    """
    Compile prefix entries into one regular expression in which every
    non-blank entry is anchored to the start of the string and the entries
    are joined as alternatives.
    """
    # Deprecated data is recognised by a literal '(' entry; its pieces are
    # plain strings and have to be escaped. Otherwise the pieces are used as
    # regular expression fragments as they are.
    is_deprecated = '(' in entries
    alternatives = []
    for piece in entries:
        if not piece.strip():
            continue
        body = re.escape(piece) if is_deprecated else piece
        alternatives.append('^' + body)
    return re.compile('|'.join(alternatives))


def compile_suffix_regex(entries):
    """
    Compile suffix entries into one regular expression in which every
    non-blank entry is anchored to the end of the string and the entries are
    joined as alternatives.
    """
    anchored = []
    for piece in entries:
        if piece.strip():
            anchored.append(piece + '$')
    pattern = '|'.join(anchored)
    return re.compile(pattern)


def compile_infix_regex(entries):
    """
    Compile infix entries into one regular expression by joining every
    non-blank entry as an alternative, without any anchoring.
    """
    kept = []
    for piece in entries:
        if piece.strip():
            kept.append(piece)
    pattern = '|'.join(kept)
    return re.compile(pattern)


def normalize_slice(length, start, stop, step=None):
    """
    Normalize slice bounds against a sequence length.

    None bounds default to the beginning / end of the sequence, negative
    bounds count from the end, and both bounds are clamped so that
    0 <= start <= stop <= length. Returns the tuple (start, stop).
    Raises ValueError if a step other than None or 1 is given.
    """
    if not (step is None or step == 1):
        # The two literals are concatenated, so the separating space has to
        # be part of the first one.
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))

    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamp against start (not 0) so that stop can never precede start.
    stop = min(length, max(start, stop))

    assert 0 <= start <= stop <= length
    return start, stop


def check_renamed_kwargs(renamed, kwargs):
    """
    Raise a TypeError if kwargs still contains a keyword argument that has
    been renamed. The renamed mapping goes from old name to new name.
    """
    for old_name, new_name in renamed.items():
        if old_name not in kwargs:
            continue
        names = (old_name, new_name)
        raise TypeError("Keyword argument %s now renamed to %s" % names)


def read_json(location):
    """
    Open the given location as UTF-8 text and return its parsed JSON content.
    """
    with location.open('r', encoding='utf8') as json_file:
        contents = ujson.load(json_file)
    return contents


def is_package(origin):
    """
    Check if string maps to an installed package (distribution).

    Distribution names are compared with '-' replaced by '_', so origin is
    expected in its underscore form. Returns True if a matching distribution
    is installed, False otherwise.
    """
    try:
        # Imported locally: only available in the standard library of newer
        # interpreters.
        from importlib.metadata import distributions
    except ImportError:
        # Legacy fallback. pip.get_installed_distributions() was an internal
        # pip API that newer pip releases no longer provide, so it is only
        # used when importlib.metadata is unavailable.
        names = [pkg.project_name for pkg in pip.get_installed_distributions()]
    else:
        names = [dist.metadata['Name'] for dist in distributions()]
    # A distribution with broken metadata may report no name at all.
    return any(name and name.replace('-', '_') == origin for name in names)


def get_model_package_path(package_name):
    """
    Return the path of the model data directory inside an installed model
    package: <install dir>/<package_name>/<package_name>-<version>, where the
    version is read from the package's meta.json.
    """
    # The module is imported purely to discover where it lives. That is
    # worryingly indirect, but locating a package any other way is hard given
    # how complicated Python's installation and import rules are.
    module = importlib.import_module(package_name)
    install_dir = Path(module.__file__).parent.parent
    version = parse_package_meta(install_dir, package_name)['version']
    versioned_name = '%s-%s' % (package_name, version)
    return install_dir / package_name / versioned_name


def parse_package_meta(package_path, package, require=True):
    """
    Load <package_path>/<package>/meta.json and return its contents as a
    dictionary. When the file is missing, an IOError is raised if require is
    true; otherwise None is returned.
    """
    # TODO: Allow passing in full model path and only require one argument
    # instead of path and package name. This lets us avoid passing in an awkward
    # empty string in spacy.load() if user supplies full model path.
    meta_path = package_path / package / 'meta.json'
    if not meta_path.is_file():
        if require:
            raise IOError("Could not read meta.json from %s" % meta_path)
        return None
    return read_json(meta_path)


def get_raw_input(description, default=False):
    """
    Prompt the user via raw_input / input and return whatever was entered.
    The prompt shows the description and, if a truthy default is given, that
    default in parentheses. The default is only displayed, not applied.
    """
    suffix = ''
    if default:
        suffix = ' (default: %s)' % default
    return input_('    %s%s: ' % (description, suffix))


def print_table(data, title=None):
    """
    Print data in table format. Can either take a list of tuples or a
    dictionary (including dict subclasses), which will be converted to a list
    of tuples. Every cell is left-aligned in a 15-character column preceded by
    four spaces; rows may have any number of cells. Empty data prints an empty
    table. An optional title is printed highlighted above the table.
    """
    if isinstance(data, dict):
        data = list(data.items())
    cell = '    {:<15}'
    # Build the template per row so that the number of columns always matches
    # the number of values, and empty data needs no special casing.
    table = '\n'.join([(cell * len(row)).format(*row) for row in data])
    if title:
        print('\n    \033[93m{}\033[0m'.format(title))
    print('\n{}\n'.format(table))


def print_markdown(data, title=None):
    """
    Print listed data in GitHub-flavoured Markdown format so it can be
    copy-pasted into issues. Can either take a list of tuples or a dictionary
    (including dict subclasses). Values that point to an existing filesystem
    path are left out; an optional title is printed as a second-level heading.
    """
    def excl_value(value):
        # contains path (personal info)
        try:
            return Path(value).exists()
        except (TypeError, ValueError, OSError):
            # Not path-like (e.g. a number) or not checkable as a path at
            # all, so it cannot be an existing path: keep it in the output.
            return False

    if isinstance(data, dict):
        data = list(data.items())
    markdown = ["* **{}:** {}".format(l, v) for l, v in data if not excl_value(v)]
    if title:
        print("\n## {}".format(title))
    print('\n{}\n'.format('\n'.join(markdown)))


def prints(*texts, **kwargs):
    """
    Print formatted message. Each positional argument is rendered as newline-
    separated paragraph. An optional highlighted title is printed above the text
    (using ANSI escape sequences manually to avoid unnecessary dependency).

    Recognised keyword arguments:
    title: text printed highlighted above the message (default: None).
    exits: if true, call sys.exit(0) after printing (default: False).
    """
    # Options are read from **kwargs rather than declared as named parameters
    # after *texts, which is not valid syntax on Python 2.
    exits = kwargs.get('exits', False)
    title = kwargs.get('title', None)
    title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''
    message = '\n\n'.join([_wrap(text) for text in texts])
    print('\n{}{}\n'.format(title, message))
    if exits:
        sys.exit(0)


def _wrap(text, wrap_max=80, indent=4):
    """
    Wrap text at given width using textwrap module. Indent should consist of
    spaces. Its length is deducted from wrap width to ensure exact wrapping.
    """
    indent = indent * ' '
    wrap_width = wrap_max - len(indent)
    if isinstance(text, Path):
        text = path2str(text)
    return textwrap.fill(text, width=wrap_width, initial_indent=indent,
                         subsequent_indent=indent, break_long_words=False,
                         break_on_hyphens=False)
Use consistent unicode declarations 2017-03-12 12:07:28 +00:00			`# coding: utf8`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00			`from __future__ import unicode_literals, print_function`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 10:05:47 +00:00
Fix json imports and use ujson 2017-04-15 10:13:34 +00:00			`import ujson`
Move is_package and get_model_package_path to util 2017-05-07 21:24:51 +00:00			`import pip`
			`import importlib`
Use `regex` instead of `re` 2017-04-19 23:22:52 +00:00			`import regex as re`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 10:05:47 +00:00			`from pathlib import Path`
Move sys_exit() function to util 2017-03-16 16:08:58 +00:00			`import sys`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00			`import textwrap`

Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`from .compat import path2str, basestring_, input_`
Handle raw_input vs input in Python 2 and 3 2017-03-20 21:48:32 +00:00

add lang registration facility 2016-03-25 17:54:45 +00:00			`LANGUAGES = {}`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 10:05:47 +00:00			`_data_path = Path(__file__).parent / 'data'`
add lang registration facility 2016-03-25 17:54:45 +00:00

relative imports in __init__.py 2016-03-26 10:44:53 +00:00			`def set_lang_class(name, cls):`
add lang registration facility 2016-03-25 17:54:45 +00:00			`global LANGUAGES`
			`LANGUAGES[name] = cls`


relative imports in __init__.py 2016-03-26 10:44:53 +00:00			`def get_lang_class(name):`
Check if full string is found in lang classes first This allows users to set arbitrary strings. (Otherwise, custom lang class "my_custom_class" would always load Burmese "my" tokenizer if one was available.) 2017-04-16 20:14:38 +00:00			`if name in LANGUAGES:`
			`return LANGUAGES[name]`
Fix get_lang_class parsing (take 2) 2016-05-16 23:40:31 +00:00			`lang = re.split('[^a-zA-Z0-9]', name, 1)[0]`
add lang registration facility 2016-03-25 17:54:45 +00:00			`if lang not in LANGUAGES:`
Check if full string is found in lang classes first This allows users to set arbitrary strings. (Otherwise, custom lang class "my_custom_class" would always load Burmese "my" tokenizer if one was available.) 2017-04-16 20:14:38 +00:00			`raise RuntimeError('Language not supported: %s' % name)`
add lang registration facility 2016-03-25 17:54:45 +00:00			`return LANGUAGES[lang]`


Unbreak data download 2017-01-09 22:40:26 +00:00			`def get_data_path(require_exists=True):`
			`if not require_exists:`
			`return _data_path`
			`else:`
			`return _data_path if _data_path.exists() else None`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00

			`def set_data_path(path):`
			`global _data_path`
Add compat functions and remove old workarounds Add ensure_path util function to handle checking instance of path 2017-04-15 10:11:16 +00:00			`_data_path = ensure_path(path)`


			`def ensure_path(path):`
			`if isinstance(path, basestring_):`
			`return Path(path)`
			`else:`
			`return path`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00

			`def read_regex(path):`
Add compat functions and remove old workarounds Add ensure_path util function to handle checking instance of path 2017-04-15 10:11:16 +00:00			`path = ensure_path(path)`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00			`with path.open() as file_:`
			`entries = file_.read().split('\n')`
			`expression = '\|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])`
			`return re.compile(expression)`


Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 12:49:53 +00:00			`def compile_prefix_regex(entries):`
Handle deprecated tokenizer prefix data 2017-01-08 19:33:28 +00:00			`if '(' in entries:`
			`# Handle deprecated data`
			`expression = '\|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])`
			`return re.compile(expression)`
			`else:`
			`expression = '\|'.join(['^' + piece for piece in entries if piece.strip()])`
			`return re.compile(expression)`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00

Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 12:49:53 +00:00			`def compile_suffix_regex(entries):`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00			`expression = '\|'.join([piece + '$' for piece in entries if piece.strip()])`
			`return re.compile(expression)`


Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 12:49:53 +00:00			`def compile_infix_regex(entries):`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00			`expression = '\|'.join([piece for piece in entries if piece.strip()])`
			`return re.compile(expression)`


Refactor to remove duplicate slicing logic 2015-10-07 08:25:35 +00:00			`def normalize_slice(length, start, stop, step=None):`
			`if not (step is None or step == 1):`
			`raise ValueError("Stepped slices not supported in Span objects."`
			`"Try: list(tokens)[start:stop:step] instead.")`
			`if start is None:`
			`start = 0`
			`elif start < 0:`
			`start += length`
			`start = min(length, max(0, start))`

			`if stop is None:`
			`stop = length`
			`elif stop < 0:`
			`stop += length`
			`stop = min(length, max(start, stop))`

			`assert 0 <= start <= stop <= length`
			`return start, stop`


Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 12:49:53 +00:00			`def check_renamed_kwargs(renamed, kwargs):`
			`for old, new in renamed.items():`
			`if old in kwargs:`
			`raise TypeError("Keyword argument %s now renamed to %s" % (old, new))`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00

Move read_json out to own util function 2017-04-16 11:03:28 +00:00			`def read_json(location):`
			`with location.open('r', encoding='utf8') as f:`
			`return ujson.load(f)`


Move is_package and get_model_package_path to util 2017-05-07 21:24:51 +00:00			`def is_package(origin):`
			`"""`
			`Check if string maps to a package installed via pip.`
			`"""`
			`packages = pip.get_installed_distributions()`
			`for package in packages:`
			`if package.project_name.replace('-', '_') == origin:`
			`return True`
			`return False`



			`def get_model_package_path(package_name):`
			`# Here we're importing the module just to find it. This is worryingly`
			`# indirect, but it's otherwise very difficult to find the package.`
			`# Python's installation and import rules are very complicated.`
			`pkg = importlib.import_module(package_name)`
			`package_path = Path(pkg.__file__).parent.parent`
			`meta = parse_package_meta(package_path, package_name)`
			`model_name = '%s-%s' % (package_name, meta['version'])`
			`return package_path / package_name / model_name`


Fix loading when no package found 2017-03-16 23:30:02 +00:00			`def parse_package_meta(package_path, package, require=True):`
Add docstring and todo note 2017-04-16 20:14:45 +00:00			`"""`
			`Check if a meta.json exists in a package and return its contents as a`
			`dictionary. If require is set to True, raise an error if no meta.json found.`
			`"""`
			`# TODO: Allow passing in full model path and only require one argument`
			`# instead of path and package name. This lets us avoid passing in an awkward`
			`# empty string in spacy.load() if user supplies full model path.`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 10:05:47 +00:00			`location = package_path / package / 'meta.json'`
			`if location.is_file():`
Move read_json out to own util function 2017-04-16 11:03:28 +00:00			`return read_json(location)`
Fix loading when no package found 2017-03-16 23:30:02 +00:00			`elif require:`
			`raise IOError("Could not read meta.json from %s" % location)`
			`else:`
			`return None`
Add util function to load and parse package meta.json 2017-03-16 16:10:05 +00:00

Add util function to get raw user input 2017-03-20 21:48:56 +00:00			`def get_raw_input(description, default=False):`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
			`Get user input via raw_input / input and return input value. Takes a`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`description, and an optional default value to display with the prompt.`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`additional = ' (default: %s)' % default if default else ''`
			`prompt = ' %s%s: ' % (description, additional)`
Add compat functions and remove old workarounds Add ensure_path util function to handle checking instance of path 2017-04-15 10:11:16 +00:00			`user_input = input_(prompt)`
Add util function to get raw user input 2017-03-20 21:48:56 +00:00			`return user_input`


Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`def print_table(data, title=None):`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
			`Print data in table format. Can either take a list of tuples or a`
			`dictionary, which will be converted to a list of tuples.`
			`"""`
Add util functions to print data as table or markdown list 2017-03-18 12:00:14 +00:00			`if type(data) == dict:`
			`data = list(data.items())`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`tpl_row = ' {:<15}' * len(data[0])`
Add util functions to print data as table or markdown list 2017-03-18 12:00:14 +00:00			`table = '\n'.join([tpl_row.format(l, v) for l, v in data])`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`if title:`
			`print('\n \033[93m{}\033[0m'.format(title))`
			`print('\n{}\n'.format(table))`
Add util functions to print data as table or markdown list 2017-03-18 12:00:14 +00:00

Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`def print_markdown(data, title=None):`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
			`Print listed data in GitHub-flavoured Markdown format so it can be`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`copy-pasted into issues. Can either take a list of tuples or a dictionary.`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
Add util functions to print data as table or markdown list 2017-03-18 12:00:14 +00:00			`def excl_value(value):`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`return Path(value).exists() # contains path (personal info)`
Add util functions to print data as table or markdown list 2017-03-18 12:00:14 +00:00
			`if type(data) == dict:`
			`data = list(data.items())`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`markdown = ["* {}: {}".format(l, v) for l, v in data if not excl_value(v)]`
			`if title:`
			`print("\n## {}".format(title))`
			`print('\n{}\n'.format('\n'.join(markdown)))`
Add util functions to print data as table or markdown list 2017-03-18 12:00:14 +00:00

Fix kwargs 2017-05-07 23:05:24 +00:00			`def prints(texts, *kwargs title=None, exits=False):`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
			`Print formatted message. Each positional argument is rendered as newline-`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`separated paragraph. An optional highlighted title is printed above the text`
			`(using ANSI escape sequences manually to avoid unnecessary dependency).`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
Fix kwargs 2017-05-07 23:05:24 +00:00			`exits = kwargs.get('exits', False)`
			`title = kwargs.get('title', None)`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''`
			`message = '\n\n'.join([_wrap(text) for text in texts])`
			`print('\n{}{}\n'.format(title, message))`
			`if exits:`
			`sys.exit(0)`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00

Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`def _wrap(text, wrap_max=80, indent=4):`
Fix formatting 2017-04-16 11:42:34 +00:00			`"""`
			`Wrap text at given width using textwrap module. Indent should consist of`
			`spaces. Its length is deducted from wrap width to ensure exact wrapping.`
			`"""`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`indent = indent * ' '`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00			`wrap_width = wrap_max - len(indent)`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`if isinstance(text, Path):`
			`text = path2str(text)`
Add util functions for printing and wrapping messages 2017-03-15 16:35:57 +00:00			`return textwrap.fill(text, width=wrap_width, initial_indent=indent,`
Tidy up CLI and fix print functions 2017-05-07 21:25:29 +00:00			`subsequent_indent=indent, break_long_words=False,`
			`break_on_hyphens=False)`