mirror of https://github.com/explosion/spaCy.git
144 lines
3.7 KiB
Python
144 lines
3.7 KiB
Python
# encoding: utf8
|
|
from __future__ import unicode_literals
|
|
import os
|
|
import io
|
|
import json
|
|
import re
|
|
import os.path
|
|
import pathlib
|
|
|
|
import six
|
|
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
|
|
|
# Python 3 has no ``basestring`` builtin; alias it to ``str`` so the
# isinstance() checks in this module behave the same on both major versions.
try:
    basestring
except NameError:
    basestring = str


# Registry filled by set_lang_class() and read by get_lang_class().
LANGUAGES = {}
# Value returned by get_data_path() until set_data_path() replaces it.
_data_path = pathlib.Path(__file__).parent.joinpath('data')
|
|
|
|
|
|
def set_lang_class(name, cls):
    """Store ``cls`` in the module-level ``LANGUAGES`` registry under ``name``.

    Any entry previously stored under the same name is overwritten.
    """
    # The registry is mutated in place, never rebound, so no ``global``
    # declaration is required.
    LANGUAGES.update({name: cls})
|
|
|
|
|
|
def get_lang_class(name):
    """Return the class registered in ``LANGUAGES`` for ``name``.

    Only the part of ``name`` before its first character outside
    ``[a-zA-Z0-9]`` is used as the lookup key.

    Raises:
        RuntimeError: if no class is registered under that key.
    """
    lang = re.split('[^a-zA-Z0-9]', name, maxsplit=1)[0]
    if lang in LANGUAGES:
        return LANGUAGES[lang]
    raise RuntimeError('Language not supported: %s' % lang)
|
|
|
|
|
|
def get_data_path():
    """Return the current module-level data path (``_data_path``)."""
    return _data_path
|
|
|
|
|
|
def set_data_path(path):
    """Replace the module-level data path.

    A string argument is converted to a ``pathlib.Path``; any other object
    is stored unchanged.
    """
    global _data_path
    _data_path = pathlib.Path(path) if isinstance(path, basestring) else path
|
|
|
|
|
|
def or_(val1, val2):
    """Return ``val1`` unless it is None, otherwise fall back to ``val2``.

    When the fallback is callable it is invoked with no arguments and its
    result is returned, so an expensive default is only computed on demand.
    Falsy values other than None (``0``, ``''``, ...) count as present.
    """
    if val1 is not None:
        return val1
    return val2() if callable(val2) else val2
|
|
|
|
|
|
def match_best_version(target_name, target_version, path):
    """Find the highest-versioned entry for ``target_name`` under ``path``.

    Each entry of the directory is split with ``split_data_name`` into a
    name and a version. Entries whose name equals ``target_name`` and whose
    version passes ``constraint_match(target_version, version)`` are
    candidates; the one with the greatest version wins.

    Args:
        target_name: name part an entry must have to be considered.
        target_version: constraint string handed to ``constraint_match``.
        path: directory to scan, as a string or a path object.

    Returns:
        A ``pathlib.Path`` for the best candidate, or None when ``path``
        does not exist or nothing matches.
    """
    path = path if not isinstance(path, basestring) else pathlib.Path(path)
    if not path.exists():
        return None

    matches = []
    for data_name in path.iterdir():
        name, version = split_data_name(data_name.parts[-1])
        if name != target_name or not constraint_match(target_version, version):
            continue
        # An entry without a '-' has version == ''. float('') would raise
        # ValueError, so give such entries an empty key, which sorts below
        # every versioned candidate.
        if version:
            version_key = tuple(float(v) for v in version.split('.'))
        else:
            version_key = ()
        matches.append((version_key, data_name))

    if not matches:
        return None
    return pathlib.Path(max(matches)[1])
|
|
|
|
|
|
def split_data_name(name):
    """Split ``name`` at its first '-' into a name part and a version part.

    Returns a two-item list ``[name, version]`` when a '-' is present, and
    the tuple ``(name, '')`` when it is not.
    """
    if '-' not in name:
        return (name, '')
    return name.split('-', 1)
|
|
|
|
|
|
def constraint_match(constraint_string, version):
    """Return whether ``version`` satisfies every given constraint.

    Args:
        constraint_string: comma-separated constraints such as
            ``'>=1.0.0,<1.1.0'``. Each constraint is one of the operators
            ``>``, ``<``, ``=``, ``>=``, ``<=``, ``==`` immediately followed
            by a dotted-integer version. An empty or None value (or one that
            contains no constraints) matches every version.
        version: dotted-integer version string, e.g. ``'1.0.3'``.

    Returns:
        True if all constraints hold, False otherwise.

    Raises:
        ValueError: if a constraint or ``version`` is malformed.
    """
    # From http://github.com/spacy-io/sputnik
    if not constraint_string:
        return True

    constraints = [c.strip() for c in constraint_string.split(',') if c.strip()]
    if not constraints:
        return True

    # Outcomes of a three-way comparison (-1, 0, 1) accepted by each operator.
    # The comparison is done with the standard library only: this module
    # never imports ``semver``, so delegating to it raised NameError.
    accepted = {
        '>': (1,), '>=': (0, 1),
        '<': (-1,), '<=': (-1, 0),
        '=': (0,), '==': (0,),
    }

    # Validate every constraint up front so a malformed one is always
    # reported, even if an earlier constraint already fails to match.
    parsed = []
    for c in constraints:
        found = re.match(r'([><=][=]?)(\d+(?:\.\d+)*)$', c)
        if not found:
            raise ValueError('invalid constraint: %s' % c)
        op, bound = found.groups()
        parsed.append((op, [int(part) for part in bound.split('.')]))

    if not re.match(r'\d+(\.\d+)*$', version):
        raise ValueError('invalid version: %s' % version)
    actual = [int(part) for part in version.split('.')]

    for op, bound in parsed:
        # Pad the shorter side with zeros so that '1.0' equals '1.0.0'.
        width = max(len(actual), len(bound))
        left = actual + [0] * (width - len(actual))
        right = bound + [0] * (width - len(bound))
        outcome = (left > right) - (left < right)
        if outcome not in accepted[op]:
            return False
    return True
|
|
|
|
|
|
def read_regex(path):
    """Compile a regex that matches any line of the file at ``path`` as a prefix.

    Every line that is not blank is escaped with ``re.escape`` and anchored
    with ``^``; the alternatives are joined with ``|``.

    Args:
        path: file location, as a string or an object providing ``open()``.

    Returns:
        The compiled pattern object.
    """
    path = path if not isinstance(path, basestring) else pathlib.Path(path)
    # Decode explicitly as UTF-8, as utf8open() does, instead of relying on
    # the platform's locale-dependent default encoding.
    with path.open(encoding='utf8') as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    return re.compile(expression)
|
|
|
|
|
|
def compile_prefix_regex(entries):
    """Compile ``entries`` into one regex of ``^``-anchored alternatives.

    Entries that are empty or whitespace-only are skipped. If ``'('`` is
    itself one of the entries, the data is treated as the deprecated format
    and each entry is escaped with ``re.escape``; otherwise entries are used
    as regex fragments verbatim.
    """
    # Handle deprecated data: decide once whether entries need escaping.
    transform = re.escape if '(' in entries else (lambda piece: piece)
    alternatives = ['^' + transform(piece) for piece in entries if piece.strip()]
    return re.compile('|'.join(alternatives))
|
|
|
|
|
|
def compile_suffix_regex(entries):
    """Compile ``entries`` into one regex of ``$``-anchored alternatives.

    Entries that are empty or whitespace-only are skipped; the rest are used
    as regex fragments verbatim.
    """
    alternatives = (piece + '$' for piece in entries if piece.strip())
    return re.compile('|'.join(alternatives))
|
|
|
|
|
|
def compile_infix_regex(entries):
    """Compile ``entries`` into one regex of unanchored alternatives.

    Entries that are empty or whitespace-only are skipped; the rest are used
    as regex fragments verbatim and joined with ``|``.
    """
    alternatives = (piece for piece in entries if piece.strip())
    return re.compile('|'.join(alternatives))
|
|
|
|
|
|
def normalize_slice(length, start, stop, step=None):
    """Resolve slice bounds against a sequence of ``length`` items.

    None bounds default to the sequence ends, negative bounds count from the
    end, and both bounds are clamped so the result is always a valid range.

    Args:
        length: number of items in the sequence being sliced.
        start: slice start, possibly None or negative.
        stop: slice stop, possibly None or negative.
        step: slice step; only None and 1 are accepted.

    Returns:
        A ``(start, stop)`` tuple with ``0 <= start <= stop <= length``.

    Raises:
        ValueError: if ``step`` is anything other than None or 1.
    """
    if not (step is None or step == 1):
        # Note the trailing space: the two literals are concatenated.
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))

    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamp against ``start`` so a reversed range collapses to an empty one.
    stop = min(length, max(start, stop))

    assert 0 <= start <= stop <= length
    return start, stop
|
|
|
|
|
|
def utf8open(loc, mode='r'):
    """Open ``loc`` with ``io.open`` in ``mode``, using UTF-8 as the encoding.

    Returns the file object; the caller is responsible for closing it.
    """
    return io.open(loc, mode=mode, encoding='utf8')
|
|
|
|
|
|
def check_renamed_kwargs(renamed, kwargs):
    """Reject keyword arguments that are passed under an old name.

    Args:
        renamed: mapping of old keyword name to its new name.
        kwargs: the keyword arguments actually received.

    Raises:
        TypeError: for the first old name (in ``renamed`` iteration order)
            that appears in ``kwargs``.
    """
    for old in renamed:
        if old not in kwargs:
            continue
        raise TypeError("Keyword argument %s now renamed to %s" % (old, renamed[old]))
|