spaCy/spacy/util.py

import os
import io
import json
import re
import os.path
from contextlib import contextmanager
import types

from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE


def local_path(*dirs):
    """Return an absolute path formed by joining `dirs` onto the directory
    that contains this module."""
    module_dir = os.path.dirname(__file__)
    joined = os.path.join(module_dir, *dirs)
    return os.path.abspath(joined)


class Package(object):
    """Small file-system shim around a model data directory.

    All paths handed to the methods are resolved relative to the package
    root (`data_path`).
    """
    @classmethod
    def create_or_return(cls, me_or_arg):
        """Return `me_or_arg` unchanged if it already is an instance of
        `cls`; otherwise construct one, passing it as `data_path`."""
        return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)

    def __init__(self, data_path=None, model='en_default-1.0.3'):
        """Create a package rooted at `data_path`.

        When `data_path` is None the root defaults to the `data/<model>`
        directory next to this module.
        """
        if data_path is None:
            data_path = local_path('data', model)
        self.model = model
        self.data_path = data_path
        self._root = self.data_path

    def get(self, key):
        """Placeholder; currently does nothing and returns None."""
        pass

    def has_file(self, *path_parts):
        """Return True if the path built from `path_parts` exists under the
        package root."""
        return os.path.exists(os.path.join(self._root, *path_parts))

    def file_path(self, *path_parts, **kwargs):
        """Join `path_parts` onto the package root. `kwargs` are accepted
        and ignored."""
        return os.path.join(self._root, *path_parts)

    def dir_path(self, *path_parts, **kwargs):
        """Join `path_parts` onto the package root. `kwargs` are accepted
        and ignored."""
        return os.path.join(self._root, *path_parts)

    def _resolve_missing(self, path_parts, default):
        """Handle a missing file according to `default`.

        An exception class is instantiated with the missing path and raised,
        an exception instance is raised as-is, and any other value is
        returned to the caller.
        """
        if _is_error_class(default):
            raise default(self.file_path(*path_parts))
        elif isinstance(default, Exception):
            raise default
        return default

    def load_json(self, path_parts, default=None):
        """Parse the UTF-8 JSON file at `path_parts` (a sequence of path
        components) and return the decoded object.

        If the file does not exist, `default` is raised when it is an
        exception class or instance, and returned otherwise.
        """
        if not self.has_file(*path_parts):
            return self._resolve_missing(path_parts, default)
        with io.open(self.file_path(*path_parts),
                     mode='r', encoding='utf8') as file_:
            return json.load(file_)

    @contextmanager
    def open(self, path_parts, mode='r', encoding='utf8', default=IOError):
        """Context manager yielding the opened file at `path_parts`.

        If the file does not exist, `default` is raised when it is an
        exception class or instance; otherwise `default` itself is yielded
        in place of a file object. The existence check happens for every
        mode, including write modes.

        `encoding` is applied for text modes. It is dropped for binary modes,
        because io.open rejects an encoding together with 'b'.
        """
        if not self.has_file(*path_parts):
            yield self._resolve_missing(path_parts, default)
            return
        if 'b' in mode:
            encoding = None
        file_ = io.open(self.file_path(*path_parts),
                        mode=mode, encoding=encoding)
        try:
            yield file_
        finally:
            # Close even when the body of the `with` statement raises.
            file_.close()


def _is_error_class(e):
    return isinstance(e, types.TypeType) and issubclass(e, Exception)


def get_package(name=None, data_path=None):
    """Build a Package rooted at `data_path`.

    `name` is accepted but not used by this function.
    """
    package = Package(data_path)
    return package


def normalize_slice(length, start, stop, step=None):
    """Clamp slice bounds to the range [0, length].

    `start` and `stop` may be None (meaning the beginning and the end
    respectively) or negative (counted from the end). The returned pair
    always satisfies 0 <= start <= stop <= length.

    Raises ValueError if `step` is anything other than None or 1.
    """
    if not (step is None or step == 1):
        # The first literal ends with a space so the two sentences do not
        # run together in the final message.
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))

    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamping against `start` makes an inverted range collapse to empty.
    stop = min(length, max(start, stop))

    assert 0 <= start <= stop <= length
    return start, stop


def utf8open(loc, mode='r'):
    """Open the file at `loc` in `mode` with UTF-8 encoding and return the
    file object."""
    handle = io.open(loc, mode=mode, encoding='utf8')
    return handle


def read_lang_data(package):
    """Load the tokenizer data files from `package`.

    Returns a 4-tuple `(tokenization, prefix, suffix, infix)`:
    `tokenization` is the parsed `tokenizer/specials.json`, and the other
    three are the expressions built from `prefix.txt`, `suffix.txt` and
    `infix.txt` in the same directory, or None for any file that is missing.
    """
    def _read_optional(filename, reader):
        # `default=None` makes package.open yield None for a missing file.
        with package.open(('tokenizer', filename), default=None) as file_:
            if file_ is None:
                return None
            return reader(file_)

    tokenization = package.load_json(('tokenizer', 'specials.json'))
    prefix = _read_optional('prefix.txt', read_prefix)
    suffix = _read_optional('suffix.txt', read_suffix)
    infix = _read_optional('infix.txt', read_infix)
    return tokenization, prefix, suffix, infix


def read_prefix(fileobj):
    """Build a regex alternation from the lines of `fileobj`.

    Each non-blank line is regex-escaped and anchored with a leading `^`;
    the pieces are joined with `|`.
    """
    anchored = []
    for line in fileobj.read().split('\n'):
        if not line.strip():
            continue
        anchored.append('^' + re.escape(line))
    return '|'.join(anchored)


def read_suffix(fileobj):
    """Build a regex alternation from the lines of `fileobj`.

    Each non-blank line is used verbatim (not escaped) with a trailing `$`
    appended; the pieces are joined with `|`.
    """
    anchored = []
    for line in fileobj.read().split('\n'):
        if line.strip():
            anchored.append(line + '$')
    return '|'.join(anchored)


def read_infix(fileobj):
    """Build a regex alternation from the lines of `fileobj`.

    Non-blank lines are used verbatim (not escaped) and joined with `|`.
    """
    lines = fileobj.read().split('\n')
    kept = filter(lambda line: line.strip(), lines)
    return '|'.join(kept)


# def read_tokenization(lang):
#     loc = path.join(DATA_DIR, lang, 'tokenization')
#     entries = []
#     seen = set()
#     with utf8open(loc) as file_:
#         for line in file_:
#             line = line.strip()
#             if line.startswith('#'):
#                 continue
#             if not line:
#                 continue
#             pieces = line.split()
#             chunk = pieces.pop(0)
#             assert chunk not in seen, chunk
#             seen.add(chunk)
#             entries.append((chunk, list(pieces)))
#             if chunk[0].isalpha() and chunk[0].islower():
#                 chunk = chunk[0].title() + chunk[1:]
#                 pieces[0] = pieces[0][0].title() + pieces[0][1:]
#                 seen.add(chunk)
#                 entries.append((chunk, pieces))
#     return entries


# def read_detoken_rules(lang): # Deprecated?
#     loc = path.join(DATA_DIR, lang, 'detokenize')
#     entries = []
#     with utf8open(loc) as file_:
#         for line in file_:
#             entries.append(line.strip())
#     return entries


def align_tokens(ref, indices): # Deprecated, surely?
    """Yield `(token, emit)` pairs for each token in `ref`.

    `emit` is the list of not-yet-consumed leading entries of `indices`
    whose second element is <= the cumulative length of the tokens seen so
    far. Asserts that every entry has been consumed once `ref` is exhausted.
    """
    pending = list(indices)
    total = len(pending)
    cursor = 0
    offset = 0
    for token in ref:
        offset += len(token)
        first = cursor
        # Advance past every entry that ends at or before the current offset.
        while cursor < total and pending[cursor][1] <= offset:
            cursor += 1
        yield token, pending[first:cursor]
    assert cursor == total


def detokenize(token_rules, words): # Deprecated?
    """Group token indices into whitespace-delimited "chunks".

    The words are joined with single spaces, then each rule in `token_rules`
    (a string whose sub-tokens are separated by `<SEP>`) is applied by
    replacing its space-separated form with the `<SEP>`-separated form.
    Each resulting whitespace-separated chunk becomes a tuple of
    consecutive token indices, e.g.

    >>> detokenize(["ca<SEP>n't", '<SEP>!'], ["I", "ca", "n't", "!"])
    [(0,), (1, 2, 3)]
    """
    joined = ' '.join(words)
    for rule in token_rules:
        # Plain substring replacement: simple rather than efficient.
        spaced = rule.replace('<SEP>', ' ')
        joined = joined.replace(spaced, rule)
    chunks = []
    offset = 0
    for piece in joined.split():
        width = len(piece.split('<SEP>'))
        chunks.append(tuple(range(offset, offset + width)))
        offset += width
    return chunks
access model via sputnik 2015-12-07 05:01:28 +00:00			`import os`
changing deprecated codecs.open to io.open =) 2015-09-30 18:10:15 +00:00			`import io`
* Make PyPy work 2015-01-05 06:54:13 +00:00			`import json`
* Add util.py 2014-09-25 16:26:22 +00:00			`import re`
* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 15:55:03 +00:00			`import os.path`
			`from contextlib import contextmanager`
Use util.Package class for io Previous Sputnik integration caused API change: Vocab, Tagger, etc were loaded via a from_package classmethod, that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance, in order to acquire a Package via sp.pool(). Instead I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod, that accepts either util.Package objects, or strings. We can later gut the internals of this and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download 2015-12-29 17:00:48 +00:00			`import types`
access model via sputnik 2015-12-07 05:01:28 +00:00
* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time. 2015-07-19 13:18:17 +00:00			`from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE`
* Add util.py 2014-09-25 16:26:22 +00:00
access model via sputnik 2015-12-07 05:01:28 +00:00
* Fix use of mock Package object 2015-12-31 03:13:15 +00:00			`def local_path(*dirs):`
			`return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs))`
* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 15:55:03 +00:00

Use util.Package class for io Previous Sputnik integration caused API change: Vocab, Tagger, etc were loaded via a from_package classmethod, that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance, in order to acquire a Package via sp.pool(). Instead I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod, that accepts either util.Package objects, or strings. We can later gut the internals of this and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download 2015-12-29 17:00:48 +00:00			`class Package(object):`
* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 15:55:03 +00:00			`@classmethod`
			`def create_or_return(cls, me_or_arg):`
Use util.Package class for io Previous Sputnik integration caused API change: Vocab, Tagger, etc were loaded via a from_package classmethod, that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance, in order to acquire a Package via sp.pool(). Instead I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod, that accepts either util.Package objects, or strings. We can later gut the internals of this and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download 2015-12-29 17:00:48 +00:00			`return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)`
* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 15:55:03 +00:00
* Fix use of mock Package object 2015-12-31 03:13:15 +00:00			`def __init__(self, data_path=None, model='en_default-1.0.3'):`
* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 15:55:03 +00:00			`if data_path is None:`
* Fix use of mock Package object 2015-12-31 03:13:15 +00:00			`data_path = local_path('data', model)`
			`self.model = model`
* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 15:55:03 +00:00			`self.data_path = data_path`
			`self._root = self.data_path`

			`def get(self, key):`
			`pass`

			`def has_file(self, *path_parts):`
			`return os.path.exists(os.path.join(self._root, *path_parts))`

			`def file_path(self, path_parts, *kwargs):`
			`return os.path.join(self._root, *path_parts)`

			`def dir_path(self, path_parts, *kwargs):`
			`return os.path.join(self._root, *path_parts)`

* Fix use of mock Package object 2015-12-31 03:13:15 +00:00			`def load_json(self, path_parts, default=None):`
			`if not self.has_file(*path_parts):`
			`if _is_error_class(default):`
			`raise default(self.file_path(*path_parts))`
			`elif isinstance(default, Exception):`
			`raise default`
			`else:`
			`return default`
			`with io.open(self.file_path(os.path.join(*path_parts)),`
			`mode='r', encoding='utf8') as file_:`
			`return json.load(file_)`
* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 15:55:03 +00:00
			`@contextmanager`
* Fix use of mock Package object 2015-12-31 03:13:15 +00:00			`def open(self, path_parts, mode='r', encoding='utf8', default=IOError):`
Use util.Package class for io Previous Sputnik integration caused API change: Vocab, Tagger, etc were loaded via a from_package classmethod, that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance, in order to acquire a Package via sp.pool(). Instead I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod, that accepts either util.Package objects, or strings. We can later gut the internals of this and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download 2015-12-29 17:00:48 +00:00			`if not self.has_file(*path_parts):`
* Fix use of mock Package object 2015-12-31 03:13:15 +00:00			`if _is_error_class(default):`
Use util.Package class for io Previous Sputnik integration caused API change: Vocab, Tagger, etc were loaded via a from_package classmethod, that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance, in order to acquire a Package via sp.pool(). Instead I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod, that accepts either util.Package objects, or strings. We can later gut the internals of this and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download 2015-12-29 17:00:48 +00:00			`raise default(self.file_path(*path_parts))`
			`elif isinstance(default, Exception):`
			`raise default`
			`else:`
			`yield default`
			`else:`
			`# Enter`
			`file_ = io.open(self.file_path(os.path.join(*path_parts)),`
* Fix use of mock Package object 2015-12-31 03:13:15 +00:00			`mode=mode, encoding='utf8')`
Use util.Package class for io Previous Sputnik integration caused API change: Vocab, Tagger, etc were loaded via a from_package classmethod, that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance, in order to acquire a Package via sp.pool(). Instead I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod, that accepts either util.Package objects, or strings. We can later gut the internals of this and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download 2015-12-29 17:00:48 +00:00			`yield file_`
			`# Exit`
			`file_.close()`
* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 15:55:03 +00:00

* Fix use of mock Package object 2015-12-31 03:13:15 +00:00			`def _is_error_class(e):`
			`return isinstance(e, types.TypeType) and issubclass(e, Exception)`


* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 15:55:03 +00:00			`def get_package(name=None, data_path=None):`
* Clean up spacy.util 2015-12-29 17:06:09 +00:00			`return Package(data_path)`
* Add util.py 2014-09-25 16:26:22 +00:00

Refactor to remove duplicate slicing logic 2015-10-07 08:25:35 +00:00			`def normalize_slice(length, start, stop, step=None):`
			`if not (step is None or step == 1):`
			`raise ValueError("Stepped slices not supported in Span objects."`
			`"Try: list(tokens)[start:stop:step] instead.")`
			`if start is None:`
			`start = 0`
			`elif start < 0:`
			`start += length`
			`start = min(length, max(0, start))`

			`if stop is None:`
			`stop = length`
			`elif stop < 0:`
			`stop += length`
			`stop = min(length, max(start, stop))`

			`assert 0 <= start <= stop <= length`
			`return start, stop`


* Add util.py 2014-09-25 16:26:22 +00:00			`def utf8open(loc, mode='r'):`
changing deprecated codecs.open to io.open =) 2015-09-30 18:10:15 +00:00			`return io.open(loc, mode, encoding='utf8')`
* Add util.py 2014-09-25 16:26:22 +00:00

access model via sputnik 2015-12-07 05:01:28 +00:00			`def read_lang_data(package):`
* Fix use of mock Package object 2015-12-31 03:13:15 +00:00			`tokenization = package.load_json(('tokenizer', 'specials.json'))`
			`with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:`
			`prefix = read_prefix(file_) if file_ is not None else None`
			`with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:`
			`suffix = read_suffix(file_) if file_ is not None else None`
			`with package.open(('tokenizer', 'infix.txt'), default=None) as file_:`
			`infix = read_infix(file_) if file_ is not None else None`
* Tighten interfaces 2014-10-30 07:14:42 +00:00			`return tokenization, prefix, suffix, infix`
* Add util.py 2014-09-25 16:26:22 +00:00

access model via sputnik 2015-12-07 05:01:28 +00:00			`def read_prefix(fileobj):`
			`entries = fileobj.read().split('\n')`
			`expression = '\|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])`
* Add util.py 2014-09-25 16:26:22 +00:00			`return expression`

* Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas 2014-12-09 03:48:01 +00:00
access model via sputnik 2015-12-07 05:01:28 +00:00			`def read_suffix(fileobj):`
			`entries = fileobj.read().split('\n')`
			`expression = '\|'.join([piece + '$' for piece in entries if piece.strip()])`
* Add util.py 2014-09-25 16:26:22 +00:00			`return expression`

* Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas 2014-12-09 03:48:01 +00:00
access model via sputnik 2015-12-07 05:01:28 +00:00			`def read_infix(fileobj):`
			`entries = fileobj.read().split('\n')`
			`expression = '\|'.join([piece for piece in entries if piece.strip()])`
* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang 2014-10-14 04:47:06 +00:00			`return expression`


access model via sputnik 2015-12-07 05:01:28 +00:00			`# def read_tokenization(lang):`
			`# loc = path.join(DATA_DIR, lang, 'tokenization')`
			`# entries = []`
			`# seen = set()`
			`# with utf8open(loc) as file_:`
			`# for line in file_:`
			`# line = line.strip()`
			`# if line.startswith('#'):`
			`# continue`
			`# if not line:`
			`# continue`
			`# pieces = line.split()`
			`# chunk = pieces.pop(0)`
			`# assert chunk not in seen, chunk`
			`# seen.add(chunk)`
			`# entries.append((chunk, list(pieces)))`
			`# if chunk[0].isalpha() and chunk[0].islower():`
			`# chunk = chunk[0].title() + chunk[1:]`
			`# pieces[0] = pieces[0][0].title() + pieces[0][1:]`
			`# seen.add(chunk)`
			`# entries.append((chunk, pieces))`
			`# return entries`


			`# def read_detoken_rules(lang): # Deprecated?`
			`# loc = path.join(DATA_DIR, lang, 'detokenize')`
			`# entries = []`
			`# with utf8open(loc) as file_:`
			`# for line in file_:`
			`# entries.append(line.strip())`
			`# return entries`
Remove trailing whitespace 2015-04-19 08:31:31 +00:00
* Add function to read detokenization rules 2014-10-22 01:54:59 +00:00
* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time. 2015-07-19 13:18:17 +00:00			`def align_tokens(ref, indices): # Deprecated, surely?`
* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang 2014-10-14 04:47:06 +00:00			`start = 0`
			`queue = list(indices)`
			`for token in ref:`
			`end = start + len(token)`
			`emit = []`
			`while queue and queue[0][1] <= end:`
			`emit.append(queue.pop(0))`
			`yield token, emit`
			`start = end`
			`assert not queue`
* Add detokenize method and test 2014-10-18 07:02:05 +00:00

* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time. 2015-07-19 13:18:17 +00:00			`def detokenize(token_rules, words): # Deprecated?`
Remove trailing whitespace 2015-04-19 08:31:31 +00:00			`"""To align with treebanks, return a list of "chunks", where a chunk is a`
* Add detokenize method and test 2014-10-18 07:02:05 +00:00			`sequence of tokens that are separated by whitespace in actual strings. Each`
			`chunk should be a tuple of token indices, e.g.`

			`>>> detokenize(["ca<SEP>n't", '<SEP>!'], ["I", "ca", "n't", "!"])`
			`[(0,), (1, 2, 3)]`
			`"""`
			`string = ' '.join(words)`
			`for subtoks in token_rules:`
			`# Algorithmically this is dumb, but writing a little list-based match`
			`# machine? Ain't nobody got time for that.`
			`string = string.replace(subtoks.replace('<SEP>', ' '), subtoks)`
			`positions = []`
			`i = 0`
			`for chunk in string.split():`
			`subtoks = chunk.split('<SEP>')`
			`positions.append(tuple(range(i, i+len(subtoks))))`
			`i += len(subtoks)`
			`return positions`