spaCy/spacy/util.py

# coding: utf8
from __future__ import unicode_literals, print_function

import ujson
import pip
import importlib
import regex as re
from pathlib import Path
import sys
import textwrap

from .symbols import ORTH
from .compat import path2str, basestring_, input_, unicode_


# Registry of Language classes keyed by name. Entries are added by
# get_lang_class (on first import) and by set_lang_class.
LANGUAGES = {}
# Default data directory: the 'data' folder next to this module. It can be
# replaced at runtime via set_data_path.
_data_path = Path(__file__).parent / 'data'
# cupy is optional. When it can't be imported, the names are bound to None so
# that get_cuda_stream and get_async can check for it.
try:
    from cupy.cuda.stream import Stream as CudaStream
except ImportError:
    CudaStream = None

try:
    import cupy
except ImportError:
    cupy = None

def get_lang_class(lang):
    """Look up a Language class by its code, importing it on first use.

    lang (unicode): Two-letter language code, e.g. 'en'.
    RETURNS (Language): Language class.
    """
    if lang in LANGUAGES:
        # Already imported earlier, or registered via set_lang_class.
        return LANGUAGES[lang]
    try:
        lang_module = importlib.import_module('.lang.%s' % lang, 'spacy')
    except ImportError:
        raise ImportError("Can't import language %s from spacy.lang." %lang)
    # The first name listed in the module's __all__ is taken as the class.
    lang_class = getattr(lang_module, lang_module.__all__[0])
    LANGUAGES[lang] = lang_class
    return lang_class


def set_lang_class(name, cls):
    """Register a Language class under a name, so that get_lang_class can
    return it.

    name (unicode): Name of Language class.
    cls (Language): Language class.
    """
    # Mutates the shared registry in place; an existing entry is replaced.
    LANGUAGES.update({name: cls})


def get_data_path(require_exists=True):
    """Return the currently configured spaCy data directory.

    require_exists (bool): Only return path if it exists, otherwise None.
    RETURNS (Path or None): Data path or None.
    """
    # The filesystem is only consulted when the caller asks for the check.
    if require_exists and not _data_path.exists():
        return None
    return _data_path


def set_data_path(path):
    """Replace the module-wide spaCy data directory.

    path (unicode or Path): Path to new data directory.
    """
    global _data_path
    # Strings are converted to Path objects; anything else is stored as given.
    new_path = ensure_path(path)
    _data_path = new_path


def ensure_path(path):
    """Convert a string to a Path and pass any other value through untouched.

    path: Anything. If string, it's converted to Path.
    RETURNS: Path or original argument.
    """
    return Path(path) if isinstance(path, basestring_) else path


def resolve_model_path(name):
    """Resolve a model name or string to a model path.

    A string is tried, in this order, as an entry in the data directory (e.g.
    a shortcut link), as an installed package and as a filesystem path. An
    object with an `exists` attribute is returned unchanged, without checking
    that it exists.

    name (unicode or Path): Package name, shortcut link or model path.
    RETURNS (Path): Path to model data directory.
    RAISES (IOError): If the data directory or the model can't be found.
    """
    # Ask for the configured path even if it is missing: get_data_path()
    # would return None in that case, and the error below would then read
    # "Can't find spaCy data path: None" instead of naming the directory.
    data_path = get_data_path(require_exists=False)
    if not data_path or not data_path.exists():
        raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
    if isinstance(name, basestring_):
        linked_path = data_path / name
        if linked_path.exists(): # in data dir or shortcut link
            return linked_path
        if is_package(name): # installed as a package
            return get_model_package_path(name)
        model_path = Path(name)
        if model_path.exists(): # path to model
            return model_path
    elif hasattr(name, 'exists'): # Path or Path-like object
        return name
    raise IOError("Can't find model '%s'" % name)


def is_package(name):
    """Check if string maps to an installed distribution.

    Distribution names are compared after replacing '-' with '_', so `name`
    should be given in that form. The comparison is case-sensitive.

    name (unicode): Name of package.
    RETURNS (bool): True if installed package, False if not.
    """
    try:
        # Preferred: the standard library's metadata API (Python 3.8+).
        from importlib.metadata import distributions
    except ImportError:
        # Older interpreters: fall back to pip's internal helper, which only
        # exists in pip < 10.
        project_names = [dist.project_name
                         for dist in pip.get_installed_distributions()]
    else:
        project_names = [dist.metadata.get('Name') for dist in distributions()]
    # Entries without a readable name are skipped rather than compared.
    return any(project_name and project_name.replace('-', '_') == name
               for project_name in project_names)


def get_model_package_path(package_name):
    """Locate the data directory of a model package that can be imported.

    The directory is expected at <package>/<package>-<version>, with the
    version read from the package's meta.json.

    package_name (unicode): Name of installed package.
    RETURNS (Path): Path to model data directory.
    """
    # Importing the module is an indirect way to find it on disk, but
    # Python's installation and import rules make other approaches hard.
    module = importlib.import_module(package_name)
    install_dir = Path(module.__file__).parent.parent
    package_dir = install_dir / package_name
    meta = parse_package_meta(package_dir)
    versioned_name = '%s-%s' % (package_name, meta['version'])
    return package_dir / versioned_name


def parse_package_meta(package_path, require=True):
    """Load the meta.json of a model package, if there is one.

    package_path (Path): Path to model package directory.
    require (bool): If True, raise error if no meta.json is found.
    RETURNS (dict or None): Model meta.json data or None.
    """
    meta_path = package_path / 'meta.json'
    if meta_path.is_file():
        return read_json(meta_path)
    if not require:
        # A missing file is acceptable to this caller.
        return None
    raise IOError("Could not read meta.json from %s" % meta_path)


def get_cuda_stream(require=False):
    """Create a CUDA stream if cupy's Stream class could be imported.

    require (bool): Accepted, but currently not used.
    RETURNS (Stream or None): A new stream, or None if CudaStream is None.
    """
    # TODO: Error and tell to install chainer if not found
    # Requires GPU
    if CudaStream is None:
        return None
    return CudaStream()


def get_async(stream, numpy_array):
    """Pass an array to cupy.array with the given stream, if cupy was imported.

    stream: Passed to cupy.array as `stream`.
    numpy_array: The array to convert.
    RETURNS: Result of cupy.array, or the input unchanged if cupy is None.
    """
    if cupy is not None:
        return cupy.array(numpy_array, stream=stream)
    return numpy_array


def read_regex(path):
    """Compile a regex from a file that lists one literal string per line.

    Every non-blank line is escaped, anchored with '^' and joined to the
    others with '|'.

    path (unicode or Path): Path to the file.
    RETURNS: Compiled regular expression.
    """
    path = ensure_path(path)
    # Decode explicitly as UTF-8 (as read_json does) rather than relying on
    # the platform's default encoding.
    with path.open('r', encoding='utf8') as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    return re.compile(expression)


def compile_prefix_regex(entries):
    """Compile prefix patterns into a single regex of '^'-anchored alternatives.

    entries: Patterns to combine. Blank entries are skipped.
    RETURNS: Compiled regular expression.
    """
    # Deprecated data is recognised by a bare '(' entry: it lists literal
    # strings rather than patterns, so every piece has to be escaped.
    if '(' in entries:
        prepare = re.escape
    else:
        prepare = lambda piece: piece
    anchored = ['^' + prepare(piece) for piece in entries if piece.strip()]
    return re.compile('|'.join(anchored))


def compile_suffix_regex(entries):
    """Compile suffix patterns into a single regex of '$'-anchored alternatives.

    entries: Patterns to combine. Blank entries are skipped.
    RETURNS: Compiled regular expression.
    """
    anchored = []
    for piece in entries:
        if piece.strip():
            anchored.append(piece + '$')
    return re.compile('|'.join(anchored))


def compile_infix_regex(entries):
    """Compile infix patterns into a single regex of unanchored alternatives.

    entries: Patterns to combine. Blank entries are skipped.
    RETURNS: Compiled regular expression.
    """
    non_blank = filter(lambda piece: piece.strip(), entries)
    return re.compile('|'.join(non_blank))


def update_exc(base_exceptions, *addition_dicts):
    """Merge tokenizer exceptions into a copy of the base dict, validating
    every added entry. Later dicts overwrite earlier entries with the same key.

    base_exceptions (dict): Base exceptions.
    *addition_dicts (dict): Exceptions to add to the base dict, in order.
    RETURNS (dict): Combined tokenizer exceptions.
    """
    invalid_orth = "Invalid value for ORTH in exception: key='%s', orths='%s'"
    mismatch = ("Invalid tokenizer exception: ORTH values combined don't "
                "match original string. key='%s', orths='%s'")
    combined = dict(base_exceptions)
    for additions in addition_dicts:
        # Each dict is validated in full before any of it is merged.
        for key, tokens in additions.items():
            for token in tokens:
                if not isinstance(token[ORTH], unicode_):
                    raise ValueError(invalid_orth % (key, tokens))
            # The tokens' ORTH values must spell out the key exactly.
            joined = ''.join(token[ORTH] for token in tokens)
            if joined != key:
                raise ValueError(mismatch % (key, joined))
        # Overlapping keys are allowed: the later dict wins.
        combined.update(additions)
    # Also add a variant with a typographic apostrophe for matching keys.
    return expand_exc(combined, "'", "’")


def expand_exc(excs, search, replace):
    """Add a variant of every exception whose key contains a given string,
    with that string replaced in the key and in each token's ORTH value.
    For example, to add additional versions with typographic apostrophes.

    excs (dict): Tokenizer exceptions.
    search (unicode): String to find and replace.
    replace (unicode): Replacement.
    RETURNS (dict): Combined tokenizer exceptions.
    """
    expanded = dict(excs)
    for key, tokens in excs.items():
        if search not in key:
            continue
        variant_key = key.replace(search, replace)
        variant_tokens = []
        for token in tokens:
            # Copy the token so the original entry stays untouched.
            token_copy = dict(token)
            token_copy[ORTH] = token_copy[ORTH].replace(search, replace)
            variant_tokens.append(token_copy)
        expanded[variant_key] = variant_tokens
    return expanded


def normalize_slice(length, start, stop, step=None):
    """Resolve slice bounds against a sequence length and clamp them to it.

    length (int): Length of the sequence being sliced.
    start (int or None): Start index. None means 0, negative values count
        from the end.
    stop (int or None): Stop index. None means length, negative values count
        from the end.
    step (int or None): Only None and 1 are accepted.
    RETURNS (tuple): (start, stop), with 0 <= start <= stop <= length.
    RAISES (ValueError): If a step other than None or 1 is given.
    """
    if not (step is None or step == 1):
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))

    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamping against start (rather than 0) turns an inverted range into an
    # empty one.
    stop = min(length, max(start, stop))

    assert 0 <= start <= stop <= length
    return start, stop


def check_renamed_kwargs(renamed, kwargs):
    """Raise an error if a keyword argument is passed under an old name.

    renamed (dict): Maps old argument names to their new names.
    kwargs (dict): Keyword arguments to check.
    """
    for old_name in renamed:
        if old_name in kwargs:
            new_name = renamed[old_name]
            raise TypeError("Keyword argument %s now renamed to %s" % (old_name, new_name))


def read_json(location):
    """Read a UTF-8 encoded JSON file and return the parsed content.

    location (Path): Path to JSON file.
    RETURNS (dict): Loaded JSON content.
    """
    with location.open('r', encoding='utf8') as json_file:
        content = ujson.load(json_file)
    return content


def get_raw_input(description, default=False):
    """Prompt the user on the command line (via raw_input / input) and return
    what was typed. The default is only displayed, not applied.

    description (unicode): Text to display before prompt.
    default (unicode or False/None): Default value to display with prompt.
    RETURNS (unicode): User input.
    """
    hint = ''
    if default:
        hint = ' (default: %s)' % default
    return input_('    %s%s: ' % (description, hint))


def print_table(data, title=None):
    """Print data in table format.

    data (dict or list of tuples): Rows to print. A dict is printed as its
        key/value pairs. The number of columns is taken from the first row, so
        every row needs at least that many values.
    title (unicode or None): Title, will be printed above.
    """
    if isinstance(data, dict):
        data = list(data.items())
    if data:
        # One left-aligned, 15-character column per value of the first row.
        tpl_row = '    {:<15}' * len(data[0])
    else:
        # Nothing to size the columns from; an empty table is printed.
        tpl_row = ''
    # Rows are unpacked as a whole, so tables are not limited to two columns.
    table = '\n'.join([tpl_row.format(*row) for row in data])
    if title:
        print('\n    \033[93m{}\033[0m'.format(title))
    print('\n{}\n'.format(table))


def print_markdown(data, title=None):
    """Print data in GitHub-flavoured Markdown format for issues etc.

    Values that point to an existing filesystem location are left out, since
    paths may contain personal info.

    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be rendered as headline 2.
    """
    def excl_value(value):
        try:
            return Path(value).exists() # contains path (personal info)
        except (TypeError, ValueError, OSError):
            # Not something that can be checked as a path (e.g. a number or
            # None), so there is no path to hide.
            return False

    if isinstance(data, dict):
        data = list(data.items())
    markdown = ["* **{}:** {}".format(l, v) for l, v in data if not excl_value(v)]
    if title:
        print("\n## {}".format(title))
    print('\n{}\n'.format('\n'.join(markdown)))


def prints(*texts, **kwargs):
    """Print a wrapped, indented message, optionally with a coloured title
    (manual ANSI escape sequences to avoid dependency).

    *texts (unicode): Texts to print. Each argument is rendered as paragraph.
    **kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit.
    """
    title = kwargs.get('title', None)
    headline = ''
    if title:
        headline = '\033[93m{}\033[0m\n'.format(_wrap(title))
    paragraphs = [_wrap(text) for text in texts]
    print('\n{}{}\n'.format(headline, '\n\n'.join(paragraphs)))
    if kwargs.get('exits', False):
        sys.exit(0)


def _wrap(text, wrap_max=80, indent=4):
    """Wrap text at given width using textwrap module.

    text (unicode): Text to wrap. If it's a Path, it's converted to string.
    wrap_max (int): Maximum line length (indent is deducted).
    indent (int): Number of spaces for indentation.
    RETURNS (unicode): Wrapped text.
    """
    indent = indent * ' '
    wrap_width = wrap_max - len(indent)
    if isinstance(text, Path):
        text = path2str(text)
    return textwrap.fill(text, width=wrap_width, initial_indent=indent,
                         subsequent_indent=indent, break_long_words=False,
                         break_on_hyphens=False)


def minify_html(html):
    """Strip indentation and newlines from displaCy's HTML templates.
    Disclaimer: NOT a general-purpose solution, it only removes runs of four
    spaces and newline characters.

    html (unicode): Markup to minify.
    RETURNS (unicode): "Minified" HTML.
    """
    trimmed = html.strip()
    without_indent = trimmed.replace('    ', '')
    return without_indent.replace('\n', '')
-												Use consistent unicode declarations

											
										
										
											2017-03-12 12:07:28 +00:00
+								# coding: utf8
-												Add util functions for printing and wrapping messages

											
										
										
											2017-03-15 16:35:57 +00:00
+								from __future__ import unicode_literals, print_function
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 10:05:47 +00:00
-												Fix json imports and use ujson

											
										
										
											2017-04-15 10:13:34 +00:00
+								import ujson
-												Move is_package and get_model_package_path to util

											
										
										
											2017-05-07 21:24:51 +00:00
+								import pip
 								import importlib
-												Use `regex` instead of `re`


											
										
										
											2017-04-19 23:22:52 +00:00
+								import regex as re
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 10:05:47 +00:00
+								from pathlib import Path
-												Move sys_exit() function to util

											
										
										
											2017-03-16 16:08:58 +00:00
+								import sys
-												Add util functions for printing and wrapping messages

											
										
										
											2017-03-15 16:35:57 +00:00
+								import textwrap
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 13:42:12 +00:00
+								from .symbols import ORTH
 								from .compat import path2str, basestring_, input_, unicode_
-												Handle raw_input vs input in Python 2 and 3

											
										
										
											2017-03-20 21:48:32 +00:00
-												add lang registration facility

											
										
										
											2016-03-25 17:54:45 +00:00
+								LANGUAGES = {}
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 10:05:47 +00:00
+								_data_path = Path(__file__).parent / 'data'
-												Improve integration of NN parser, to support unified training API

											
										
										
											2017-05-15 19:46:08 +00:00
+								try:
 								    from cupy.cuda.stream import Stream as CudaStream
 								except ImportError:
 								    CudaStream = None
-												add lang registration facility

											
										
										
											2016-03-25 17:54:45 +00:00
-												Improve integration of NN parser, to support unified training API

											
										
										
											2017-05-15 19:46:08 +00:00
+								try:
 								    import cupy
 								except ImportError:
 								    cupy = None
-												add lang registration facility

											
										
										
											2016-03-25 17:54:45 +00:00
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-13 23:31:10 +00:00
+								def get_lang_class(lang):
 								    """Import and load a Language class.
-												add lang registration facility

											
										
										
											2016-03-25 17:54:45 +00:00
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-13 23:31:10 +00:00
+								    lang (unicode): Two-letter language code, e.g. 'en'.
 								    RETURNS (Language): Language class.
 								    """
 								    global LANGUAGES
 								    if not lang in LANGUAGES:
 								        try:
 								            module = importlib.import_module('.lang.%s' % lang, 'spacy')
 								        except ImportError:
 								            raise ImportError("Can't import language %s from spacy.lang." %lang)
 								        LANGUAGES[lang] = getattr(module, module.__all__[0])
-												add lang registration facility

											
										
										
											2016-03-25 17:54:45 +00:00
+								    return LANGUAGES[lang]
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-13 23:31:10 +00:00
+								def set_lang_class(name, cls):
 								    """Set a custom Language class name that can be loaded via get_lang_class.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-13 23:31:10 +00:00
+								    name (unicode): Name of Language class.
 								    cls (Language): Language class.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-13 23:31:10 +00:00
+								    global LANGUAGES
 								    LANGUAGES[name] = cls
-												Add load_lang_class() util function

											
										
										
											2017-05-08 21:50:45 +00:00
-												Unbreak data download

											
										
										
											2017-01-09 22:40:26 +00:00
+								def get_data_path(require_exists=True):
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """Get path to spaCy data directory.
-												Update docstrings

											
										
										
											2017-05-13 23:30:29 +00:00
+								    require_exists (bool): Only return path if it exists, otherwise None.
 								    RETURNS (Path or None): Data path or None.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """
-												Unbreak data download

											
										
										
											2017-01-09 22:40:26 +00:00
+								    if not require_exists:
 								        return _data_path
 								    else:
 								        return _data_path if _data_path.exists() else None
-												Finish refactoring data loading

											
										
										
											2016-09-24 18:26:17 +00:00
 								def set_data_path(path):
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """Set path to spaCy data directory.
-												Update docstrings

											
										
										
											2017-05-13 23:30:29 +00:00
+								    path (unicode or Path): Path to new data directory.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """
-												Finish refactoring data loading

											
										
										
											2016-09-24 18:26:17 +00:00
+								    global _data_path
-												Add compat functions and remove old workarounds

Add ensure_path util function to handle checking instance of path

											
										
										
											2017-04-15 10:11:16 +00:00
+								    _data_path = ensure_path(path)
 								def ensure_path(path):
-												Update docstrings

											
										
										
											2017-05-13 23:30:29 +00:00
+								    """Ensure string is converted to a Path.
 								    path: Anything. If string, it's converted to Path.
 								    RETURNS: Path or original argument.
 								    """
-												Add compat functions and remove old workarounds

Add ensure_path util function to handle checking instance of path

											
										
										
											2017-04-15 10:11:16 +00:00
+								    if isinstance(path, basestring_):
 								        return Path(path)
 								    else:
 								        return path
-												Finish refactoring data loading

											
										
										
											2016-09-24 18:26:17 +00:00
-												Reorder util functions

											
										
										
											2017-05-08 21:51:15 +00:00
+								def resolve_model_path(name):
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """Resolve a model name or string to a model path.
-												Update docstrings

											
										
										
											2017-05-13 23:30:29 +00:00
+								    name (unicode): Package name, shortcut link or model path.
 								    RETURNS (Path): Path to model data directory.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """
-												Reorder util functions

											
										
										
											2017-05-08 21:51:15 +00:00
+								    data_path = get_data_path()
 								    if not data_path or not data_path.exists():
 								        raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
 								    if isinstance(name, basestring_):
 								        if (data_path / name).exists(): # in data dir or shortcut link
 								            return (data_path / name)
 								        if is_package(name): # installed as a package
 								            return get_model_package_path(name)
 								        if Path(name).exists(): # path to model
 								            return Path(name)
 								    elif hasattr(name, 'exists'): # Path or Path-like object
 								        return name
 								    raise IOError("Can't find model '%s'" % name)
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								def is_package(name):
 								    """Check if string maps to a package installed via pip.
-												Update docstrings

											
										
										
											2017-05-13 23:30:29 +00:00
+								    name (unicode): Name of package.
 								    RETURNS (bool): True if installed package, False if not.
-												Reorder util functions

											
										
										
											2017-05-08 21:51:15 +00:00
+								    """
 								    packages = pip.get_installed_distributions()
 								    for package in packages:
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								        if package.project_name.replace('-', '_') == name:
-												Reorder util functions

											
										
										
											2017-05-08 21:51:15 +00:00
+								            return True
 								    return False
 								def get_model_package_path(package_name):
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """Get path to a model package installed via pip.
-												Update docstrings

											
										
										
											2017-05-13 23:30:29 +00:00
+								    package_name (unicode): Name of installed package.
 								    RETURNS (Path): Path to model data directory.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """
-												Reorder util functions

											
										
										
											2017-05-08 21:51:15 +00:00
+								    # Here we're importing the module just to find it. This is worryingly
 								    # indirect, but it's otherwise very difficult to find the package.
 								    # Python's installation and import rules are very complicated.
 								    pkg = importlib.import_module(package_name)
 								    package_path = Path(pkg.__file__).parent.parent
 								    meta = parse_package_meta(package_path / package_name)
 								    model_name = '%s-%s' % (package_name, meta['version'])
 								    return package_path / package_name / model_name
 								def parse_package_meta(package_path, require=True):
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """Check if a meta.json exists in a package and return its contents.
-												Update docstrings

											
										
										
											2017-05-13 23:30:29 +00:00
+								    package_path (Path): Path to model package directory.
 								    require (bool): If True, raise error if no meta.json is found.
 								    RETURNS (dict or None): Model meta.json data or None.
-												Reorder util functions

											
										
										
											2017-05-08 21:51:15 +00:00
+								    """
 								    location = package_path / 'meta.json'
 								    if location.is_file():
 								        return read_json(location)
 								    elif require:
 								        raise IOError("Could not read meta.json from %s" % location)
 								    else:
 								        return None
-												Remove cupy imports from parser, so it can work on CPU

											
										
										
											2017-05-13 22:37:53 +00:00
+								def get_cuda_stream(require=False):
 								    # TODO: Error and tell to install chainer if not found
 								    # Requires GPU
-												Improve integration of NN parser, to support unified training API

											
										
										
											2017-05-15 19:46:08 +00:00
+								    return CudaStream() if CudaStream is not None else None
 								def get_async(stream, numpy_array):
 								    if cupy is None:
 								        return numpy_array
 								    else:
 								        return cupy.array(numpy_array, stream=stream)
-												Remove cupy imports from parser, so it can work on CPU

											
										
										
											2017-05-13 22:37:53 +00:00
-												Finish refactoring data loading

											
										
										
											2016-09-24 18:26:17 +00:00
+								def read_regex(path):
-												Add compat functions and remove old workarounds

Add ensure_path util function to handle checking instance of path

											
										
										
											2017-04-15 10:11:16 +00:00
+								    path = ensure_path(path)
-												Finish refactoring data loading

											
										
										
											2016-09-24 18:26:17 +00:00
+								    with path.open() as file_:
 								        entries = file_.read().split('\n')
 								    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
 								    return re.compile(expression)
-												Refactor so that the tokenizer data is read from Python data, rather than from disk

											
										
										
											2016-09-25 12:49:53 +00:00
+								def compile_prefix_regex(entries):
-												Handle deprecated tokenizer prefix data

											
										
										
											2017-01-08 19:33:28 +00:00
+								    if '(' in entries:
 								        # Handle deprecated data
 								        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
 								        return re.compile(expression)
 								    else:
 								        expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
 								        return re.compile(expression)
-												Finish refactoring data loading

											
										
										
											2016-09-24 18:26:17 +00:00
-												Refactor so that the tokenizer data is read from Python data, rather than from disk

											
										
										
											2016-09-25 12:49:53 +00:00
+								def compile_suffix_regex(entries):
-												Finish refactoring data loading

											
										
										
											2016-09-24 18:26:17 +00:00
+								    expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
 								    return re.compile(expression)
-												Refactor so that the tokenizer data is read from Python data, rather than from disk

											
										
										
											2016-09-25 12:49:53 +00:00
+								def compile_infix_regex(entries):
-												Finish refactoring data loading

											
										
										
											2016-09-24 18:26:17 +00:00
+								    expression = '|'.join([piece for piece in entries if piece.strip()])
 								    return re.compile(expression)
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 13:42:12 +00:00
+								def update_exc(base_exceptions, *addition_dicts):
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """Update and validate tokenizer exceptions. Will overwrite exceptions.
-												Update docstrings

											
										
										
											2017-05-13 23:30:29 +00:00
+								    base_exceptions (dict): Base exceptions.
 								    *addition_dicts (dict): Exceptions to add to the base dict, in order.
 								    RETURNS (dict): Combined tokenizer exceptions.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 13:42:12 +00:00
+								    exc = dict(base_exceptions)
 								    for additions in addition_dicts:
 								        for orth, token_attrs in additions.items():
 								            if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
 								                msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
 								                raise ValueError(msg % (orth, token_attrs))
 								            described_orth = ''.join(attr[ORTH] for attr in token_attrs)
 								            if orth != described_orth:
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								                raise ValueError("Invalid tokenizer exception: ORTH values "
 								                                 "combined don't match original string. "
 								                                 "key='%s', orths='%s'" % (orth, described_orth))
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 13:42:12 +00:00
+								        # overlap = set(exc.keys()).intersection(set(additions))
 								        # assert not overlap, overlap
 								        exc.update(additions)
-												Fix expand_exc to make sure it returns combined dict

											
										
										
											2017-05-13 19:22:25 +00:00
+								    exc = expand_exc(exc, "'", "’")
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 13:42:12 +00:00
+								    return exc
 								def expand_exc(excs, search, replace):
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """Find string in tokenizer exceptions, duplicate entry and replace string.
 								    For example, to add additional versions with typographic apostrophes.
-												Update docstrings

											
										
										
											2017-05-13 23:30:29 +00:00
+								    excs (dict): Tokenizer exceptions.
 								    search (unicode): String to find and replace.
 								    replace (unicode): Replacement.
 								    RETURNS (dict): Combined tokenizer exceptions.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 19:22:49 +00:00
+								    """
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 13:42:12 +00:00
+								    def _fix_token(token, search, replace):
 								        fixed = dict(token)
 								        fixed[ORTH] = fixed[ORTH].replace(search, replace)
 								        return fixed
-												Fix expand_exc to make sure it returns combined dict

											
										
										
											2017-05-13 19:22:25 +00:00
+								    new_excs = dict(excs)
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 13:42:12 +00:00
+								    for token_string, tokens in excs.items():
 								        if search in token_string:
 								            new_key = token_string.replace(search, replace)
 								            new_value = [_fix_token(t, search, replace) for t in tokens]
-												Fix expand_exc to make sure it returns combined dict

											
										
										
											2017-05-13 19:22:25 +00:00
+								            new_excs[new_key] = new_value
 								    return new_excs
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 13:42:12 +00:00
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 08:25:35 +00:00
def normalize_slice(length, start, stop, step=None):
    """Resolve slice bounds against a sequence length and clamp them.

    length (int): Length of the sequence being sliced.
    start (int or None): Start index. None means 0, negative values are
        offset by `length`.
    stop (int or None): Stop index. None means `length`, negative values are
        offset by `length`.
    step (int or None): Slice step. Only None or 1 is accepted.
    RETURNS (tuple): `(start, stop)` with 0 <= start <= stop <= length.
    RAISES (ValueError): If a step other than None or 1 is given.
    """
    if not (step is None or step == 1):
        # The trailing space keeps the two sentences from running together
        # when the adjacent string literals are concatenated.
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))
    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamping against `start` turns an inverted range into an empty one.
    stop = min(length, max(start, stop))
    assert 0 <= start <= stop <= length
    return start, stop
-												Refactor so that the tokenizer data is read from Python data, rather than from disk

											
										
										
											2016-09-25 12:49:53 +00:00
def check_renamed_kwargs(renamed, kwargs):
    """Raise an error if a keyword argument is passed under its old name.

    renamed (dict): Mapping of old keyword names to their new names.
    kwargs (dict): Keyword arguments to check.
    RAISES (TypeError): For the first old name found in `kwargs`.
    """
    for deprecated in renamed:
        if deprecated not in kwargs:
            continue
        current = renamed[deprecated]
        raise TypeError("Keyword argument %s now renamed to %s"
                        % (deprecated, current))
-												Add util functions for printing and wrapping messages

											
										
										
											2017-03-15 16:35:57 +00:00
-												Move read_json out to own util function

											
										
										
											2017-04-16 11:03:28 +00:00
def read_json(location):
    """Open and load JSON from file.

    location (unicode or Path): Path to JSON file. Strings are converted to
        a Path via ensure_path before opening.
    RETURNS (dict): Loaded JSON content.
    """
    # Plain strings have no .open() method taking an encoding, so normalise
    # the argument first; Path objects pass through ensure_path unchanged.
    location = ensure_path(location)
    with location.open('r', encoding='utf8') as f:
        return ujson.load(f)
-												Add util function to get raw user input

											
										
										
											2017-03-20 21:48:56 +00:00
def get_raw_input(description, default=False):
    """Prompt the user on the command line and return what they typed.

    description (unicode): Text to display before prompt.
    default (unicode or False/None): Default value to display with prompt.
        Only shown when truthy; it is not substituted for empty input.
    RETURNS (unicode): User input.
    """
    suffix = ''
    if default:
        suffix = ' (default: %s)' % default
    return input_('    %s%s: ' % (description, suffix))
-												Tidy up CLI and fix print functions

											
										
										
											2017-05-07 21:25:29 +00:00
def print_table(data, title=None):
    """Print data in table format.

    data (dict or list of tuples): Label/value pairs. May be empty, in which
        case an empty table is printed.
    title (unicode or None): Title, will be printed above.
    """
    if isinstance(data, dict):
        data = list(data.items())
    # Each row is unpacked as a (label, value) pair below, so the template
    # always has two columns. Not deriving the width from data[0] avoids an
    # IndexError on empty input.
    tpl_row = '    {:<15}' * 2
    table = '\n'.join([tpl_row.format(l, v) for l, v in data])
    if title:
        print('\n    \033[93m{}\033[0m'.format(title))
    print('\n{}\n'.format(table))
-												Add util functions to print data as table or markdown list

											
										
										
											2017-03-18 12:00:14 +00:00
-												Tidy up CLI and fix print functions

											
										
										
											2017-05-07 21:25:29 +00:00
def print_markdown(data, title=None):
    """Print data in GitHub-flavoured Markdown format for issues etc.

    Pairs whose value names an existing filesystem path are omitted from the
    output.

    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be rendered as headline 2.
    """
    def excl_value(value):
        # Existing paths are excluded because they can contain personal info.
        # Values that can't be interpreted as a path at all (ints, bools,
        # None, strings the OS rejects) are kept rather than crashing.
        try:
            return Path(value).exists()
        except (TypeError, ValueError, OSError):
            return False

    if isinstance(data, dict):
        data = list(data.items())
    markdown = ["* **{}:** {}".format(l, v) for l, v in data
                if not excl_value(v)]
    if title:
        print("\n## {}".format(title))
    print('\n{}\n'.format('\n'.join(markdown)))
-												Add util functions to print data as table or markdown list

											
										
										
											2017-03-18 12:00:14 +00:00
-												Fix typo

											
										
										
											2017-05-08 00:00:37 +00:00
def prints(*texts, **kwargs):
    """Print formatted message (manual ANSI escape sequences to avoid dependency)

    *texts (unicode): Texts to print. Each argument is wrapped and rendered
        as its own paragraph.
    **kwargs: 'title' becomes coloured headline. 'exits'=True performs sys
        exit (with status 0) after printing.
    """
    headline = kwargs.get('title', None)
    should_exit = kwargs.get('exits', False)
    if headline:
        headline = '\033[93m{}\033[0m\n'.format(_wrap(headline))
    else:
        headline = ''
    paragraphs = [_wrap(text) for text in texts]
    print('\n{}{}\n'.format(headline, '\n\n'.join(paragraphs)))
    if should_exit:
        sys.exit(0)
-												Add util functions for printing and wrapping messages

											
										
										
											2017-03-15 16:35:57 +00:00
-												Tidy up CLI and fix print functions

											
										
										
											2017-05-07 21:25:29 +00:00
def _wrap(text, wrap_max=80, indent=4):
    """Indent text and wrap it with textwrap.fill.

    Long words and hyphenated words are never split.

    text (unicode): Text to wrap. If it's a Path, it's converted to string.
    wrap_max (int): Maximum line length (indent is deducted).
    indent (int): Number of spaces for indentation.
    RETURNS (unicode): Wrapped text.
    """
    padding = ' ' * indent
    if isinstance(text, Path):
        text = path2str(text)
    return textwrap.fill(
        text,
        width=wrap_max - len(padding),
        initial_indent=padding,
        subsequent_indent=padding,
        break_long_words=False,
        break_on_hyphens=False)
-												Add displaCy visualisers (see #1058)

											
										
										
											2017-05-14 15:50:23 +00:00
 								def minify_html(html):
 								    """Perform a template-specific, rudimentary HTML minification for displaCy.
 								    Disclaimer: NOT a general-purpose solution, only removes indentation/newlines.
 								    html (unicode): Markup to minify.
 								    RETURNS (unicode): "Minified" HTML.
 								    """
 								    return html.strip().replace('    ', '').replace('\n', '')