mirror of https://github.com/explosion/spaCy.git
integrate with sputnik
This commit is contained in:
parent
aa51014f71
commit
bc229790ac
|
@ -10,4 +10,4 @@ plac
|
||||||
six
|
six
|
||||||
ujson
|
ujson
|
||||||
cloudpickle
|
cloudpickle
|
||||||
sputnik>=0.6.4,<0.7.0
|
sputnik>=0.7.0,<0.8.0
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -271,7 +271,7 @@ def setup_package():
|
||||||
ext_modules=ext_modules,
|
ext_modules=ext_modules,
|
||||||
install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
|
install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
|
||||||
'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',
|
'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',
|
||||||
'ujson', 'cloudpickle', 'sputnik>=0.6.4,<0.7.0'],
|
'ujson', 'cloudpickle', 'sputnik>=0.7.0,<0.8.0'],
|
||||||
cmdclass = {
|
cmdclass = {
|
||||||
'build_ext': build_ext_subclass},
|
'build_ext': build_ext_subclass},
|
||||||
)
|
)
|
||||||
|
|
|
@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown
|
||||||
from .syntax.arc_eager import ArcEager
|
from .syntax.arc_eager import ArcEager
|
||||||
|
|
||||||
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
|
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
|
||||||
from .util import get_package, Package
|
from .util import get_package
|
||||||
|
|
||||||
|
|
||||||
class Language(object):
|
class Language(object):
|
||||||
|
@ -146,13 +146,13 @@ class Language(object):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_parser(cls, package, vocab):
|
def default_parser(cls, package, vocab):
|
||||||
data_dir = package.dir_path('deps', require=False)
|
data_dir = package.dir_path('deps')
|
||||||
if data_dir and path.exists(data_dir):
|
if data_dir and path.exists(data_dir):
|
||||||
return Parser.from_dir(data_dir, vocab.strings, ArcEager)
|
return Parser.from_dir(data_dir, vocab.strings, ArcEager)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_entity(cls, package, vocab):
|
def default_entity(cls, package, vocab):
|
||||||
data_dir = package.dir_path('ner', require=False)
|
data_dir = package.dir_path('ner')
|
||||||
if data_dir and path.exists(data_dir):
|
if data_dir and path.exists(data_dir):
|
||||||
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
|
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
|
||||||
|
|
||||||
|
@ -182,7 +182,7 @@ class Language(object):
|
||||||
- Language(model='en_default ==1.0.0')
|
- Language(model='en_default ==1.0.0')
|
||||||
- Language(model='en_default <1.1.0', data_dir='spacy/data')
|
- Language(model='en_default <1.1.0', data_dir='spacy/data')
|
||||||
"""
|
"""
|
||||||
package = Package(data_dir)
|
package = get_package(model, data_path=data_dir)
|
||||||
if load_vectors is not True:
|
if load_vectors is not True:
|
||||||
warn("load_vectors is deprecated", DeprecationWarning)
|
warn("load_vectors is deprecated", DeprecationWarning)
|
||||||
if vocab in (None, True):
|
if vocab in (None, True):
|
||||||
|
|
|
@ -8,13 +8,13 @@ except ImportError:
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
|
from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
|
||||||
from .util import Package
|
from .util import get_package
|
||||||
|
|
||||||
|
|
||||||
class Lemmatizer(object):
|
class Lemmatizer(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, pkg_or_str_or_file):
|
def load(cls, pkg_or_str_or_file):
|
||||||
pkg = Package.create_or_return(pkg_or_str_or_file)
|
pkg = get_package(pkg_or_str_or_file)
|
||||||
index = {}
|
index = {}
|
||||||
exc = {}
|
exc = {}
|
||||||
for pos in ['adj', 'noun', 'verb']:
|
for pos in ['adj', 'noun', 'verb']:
|
||||||
|
|
|
@ -21,7 +21,7 @@ from .tokens.doc cimport Doc
|
||||||
from .vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
|
|
||||||
from .attrs import FLAG61 as U_ENT
|
from .attrs import FLAG61 as U_ENT
|
||||||
from .util import Package
|
from .util import get_package
|
||||||
|
|
||||||
from .attrs import FLAG60 as B2_ENT
|
from .attrs import FLAG60 as B2_ENT
|
||||||
from .attrs import FLAG59 as B3_ENT
|
from .attrs import FLAG59 as B3_ENT
|
||||||
|
@ -171,7 +171,7 @@ cdef class Matcher:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, pkg_or_str_or_file, Vocab vocab):
|
def load(cls, pkg_or_str_or_file, Vocab vocab):
|
||||||
package = Package.create_or_return(pkg_or_str_or_file)
|
package = get_package(pkg_or_str_or_file)
|
||||||
patterns = package.load_json(('vocab', 'gazetteer.json'))
|
patterns = package.load_json(('vocab', 'gazetteer.json'))
|
||||||
return cls(vocab, patterns)
|
return cls(vocab, patterns)
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,7 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
|
||||||
|
|
||||||
from .attrs cimport *
|
from .attrs cimport *
|
||||||
|
|
||||||
from .util import Package
|
from .util import get_package
|
||||||
|
|
||||||
|
|
||||||
cpdef enum:
|
cpdef enum:
|
||||||
|
@ -149,7 +149,7 @@ cdef class Tagger:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, pkg_or_str_or_file, vocab):
|
def load(cls, pkg_or_str_or_file, vocab):
|
||||||
pkg = Package.create_or_return(pkg_or_str_or_file)
|
pkg = get_package(pkg_or_str_or_file)
|
||||||
# TODO: templates.json deprecated? not present in latest package
|
# TODO: templates.json deprecated? not present in latest package
|
||||||
templates = cls.default_templates()
|
templates = cls.default_templates()
|
||||||
# templates = package.load_utf8(json.load,
|
# templates = package.load_utf8(json.load,
|
||||||
|
|
|
@ -5,7 +5,7 @@ import io
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
|
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
|
||||||
from spacy.util import get_package, Package
|
from spacy.util import get_package
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,7 @@ cimport cython
|
||||||
from . import util
|
from . import util
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
from .util import read_lang_data
|
from .util import read_lang_data
|
||||||
from .util import Package
|
from .util import get_package
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokenizer:
|
cdef class Tokenizer:
|
||||||
|
@ -43,7 +43,7 @@ cdef class Tokenizer:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, pkg_or_str_or_file, Vocab vocab):
|
def load(cls, pkg_or_str_or_file, Vocab vocab):
|
||||||
pkg = Package.create_or_return(pkg_or_str_or_file)
|
pkg = get_package(pkg_or_str_or_file)
|
||||||
rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
|
rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
|
||||||
prefix_re = re.compile(prefix_re)
|
prefix_re = re.compile(prefix_re)
|
||||||
suffix_re = re.compile(suffix_re)
|
suffix_re = re.compile(suffix_re)
|
||||||
|
|
|
@ -3,76 +3,27 @@ import io
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import os.path
|
import os.path
|
||||||
from contextlib import contextmanager
|
|
||||||
import types
|
|
||||||
|
|
||||||
|
import sputnik
|
||||||
|
from sputnik.dir_package import DirPackage
|
||||||
|
from sputnik.package_stub import PackageStub
|
||||||
|
|
||||||
|
from . import about
|
||||||
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||||
|
|
||||||
|
|
||||||
def local_path(*dirs):
|
def get_package(value=None, data_path=None):
|
||||||
return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs))
|
if data_path is None:
|
||||||
|
if isinstance(value, PackageStub):
|
||||||
|
return value
|
||||||
|
elif value and os.path.isdir(value):
|
||||||
|
return DirPackage(value)
|
||||||
|
|
||||||
|
elif value is None and data_path is not None:
|
||||||
|
return DirPackage(data_path)
|
||||||
|
|
||||||
class Package(object):
|
return sputnik.package('spacy', about.short_version,
|
||||||
@classmethod
|
value or 'en_default==1.0.4', data_path=data_path)
|
||||||
def create_or_return(cls, me_or_arg):
|
|
||||||
return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)
|
|
||||||
|
|
||||||
def __init__(self, data_path=None, model='en_default-1.0.3'):
|
|
||||||
if data_path is None:
|
|
||||||
data_path = local_path('data', model)
|
|
||||||
self.model = model
|
|
||||||
self.data_path = data_path
|
|
||||||
self._root = self.data_path
|
|
||||||
|
|
||||||
def get(self, key):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def has_file(self, *path_parts):
|
|
||||||
return os.path.exists(os.path.join(self._root, *path_parts))
|
|
||||||
|
|
||||||
def file_path(self, *path_parts, **kwargs):
|
|
||||||
return os.path.join(self._root, *path_parts)
|
|
||||||
|
|
||||||
def dir_path(self, *path_parts, **kwargs):
|
|
||||||
return os.path.join(self._root, *path_parts)
|
|
||||||
|
|
||||||
def load_json(self, path_parts, default=None):
|
|
||||||
if not self.has_file(*path_parts):
|
|
||||||
if _is_error_class(default):
|
|
||||||
raise default(self.file_path(*path_parts))
|
|
||||||
elif isinstance(default, Exception):
|
|
||||||
raise default
|
|
||||||
else:
|
|
||||||
return default
|
|
||||||
with io.open(self.file_path(os.path.join(*path_parts)),
|
|
||||||
mode='r', encoding='utf8') as file_:
|
|
||||||
return json.load(file_)
|
|
||||||
|
|
||||||
@contextmanager
|
|
||||||
def open(self, path_parts, mode='r', encoding='utf8', default=IOError):
|
|
||||||
if not self.has_file(*path_parts):
|
|
||||||
if _is_error_class(default):
|
|
||||||
raise default(self.file_path(*path_parts))
|
|
||||||
elif isinstance(default, Exception):
|
|
||||||
raise default
|
|
||||||
else:
|
|
||||||
yield default
|
|
||||||
else:
|
|
||||||
# Enter
|
|
||||||
file_ = io.open(self.file_path(os.path.join(*path_parts)),
|
|
||||||
mode=mode, encoding='utf8')
|
|
||||||
yield file_
|
|
||||||
# Exit
|
|
||||||
file_.close()
|
|
||||||
|
|
||||||
|
|
||||||
def _is_error_class(e):
|
|
||||||
return isinstance(e, types.TypeType) and issubclass(e, Exception)
|
|
||||||
|
|
||||||
|
|
||||||
def get_package(name=None, data_path=None):
|
|
||||||
return Package(data_path)
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_slice(length, start, stop, step=None):
|
def normalize_slice(length, start, stop, step=None):
|
||||||
|
|
|
@ -19,7 +19,7 @@ from .orth cimport word_shape
|
||||||
from .typedefs cimport attr_t
|
from .typedefs cimport attr_t
|
||||||
from .cfile cimport CFile
|
from .cfile cimport CFile
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
from .util import Package
|
from .util import get_package
|
||||||
|
|
||||||
from . import attrs
|
from . import attrs
|
||||||
from . import symbols
|
from . import symbols
|
||||||
|
@ -49,7 +49,7 @@ cdef class Vocab:
|
||||||
'''
|
'''
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, pkg_or_str_or_file, get_lex_attr=None):
|
def load(cls, pkg_or_str_or_file, get_lex_attr=None):
|
||||||
package = Package.create_or_return(pkg_or_str_or_file)
|
package = get_package(pkg_or_str_or_file)
|
||||||
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
|
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
|
||||||
|
|
||||||
lemmatizer = Lemmatizer.load(package)
|
lemmatizer = Lemmatizer.load(package)
|
||||||
|
|
Loading…
Reference in New Issue