Auto-detect package dependencies in spacy package (#8948)

* Auto-detect package dependencies in spacy package

* Add simple get_third_party_dependencies test

* Import packages_distributions explicitly

* Inline packages_distributions

* Fix docstring [ci skip]

* Relax catalogue requirement

* Move importlib_metadata to spacy.compat with note

* Include license information [ci skip]
This commit is contained in:
Ines Montani 2021-08-17 22:05:13 +10:00 committed by GitHub
parent 0a6b68848f
commit d94ddd5686
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 130 additions and 15 deletions

View File

@ -104,3 +104,26 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
importlib_metadata
------------------
* Files: util.py
The implementation of packages_distributions() is adapted from
importlib_metadata, which is distributed under the following license:
Copyright 2017-2019 Jason R. Coombs, Barry Warsaw
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -2,6 +2,8 @@ from typing import Optional, Union, Any, Dict, List, Tuple
import shutil
from pathlib import Path
from wasabi import Printer, MarkdownRenderer, get_raw_input
from thinc.api import Config
from collections import defaultdict
import srsly
import sys
@ -99,6 +101,12 @@ def package(
msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
meta = get_meta(input_dir, meta)
if meta["requirements"]:
msg.good(
f"Including {len(meta['requirements'])} package requirement(s) from "
f"meta and config",
", ".join(meta["requirements"]),
)
if name is not None:
meta["name"] = name
if version is not None:
@ -175,6 +183,51 @@ def has_wheel() -> bool:
return False
def get_third_party_dependencies(
    config: Config, exclude: List[str] = util.SimpleFrozenList()
) -> List[str]:
    """If the config includes references to registered functions that are
    provided by third-party packages (spacy-transformers, other libraries), we
    want to include them in meta["requirements"] so that the package specifies
    them as dependencies and the user won't have to do it manually.

    We do this by:
    - traversing the config to check for registered function (@ keys)
    - looking up the functions and getting their module
    - looking up the module version and generating an appropriate version range

    Every distribution is listed at most once, and modules are processed in
    sorted order so that repeated runs produce the same list of requirements.

    config (Config): The pipeline config.
    exclude (list): List of packages to exclude (e.g. that already exist in meta).
    RETURNS (list): The versioned requirements.
    """
    own_packages = ("spacy", "spacy-nightly", "thinc", "srsly")
    distributions = util.packages_distributions()
    funcs = defaultdict(set)
    for path, value in util.walk_dict(config):
        if path[-1].startswith("@"):  # collect all function references by registry
            funcs[path[-1][1:]].add(value)
    modules = set()
    for reg_name, func_names in funcs.items():
        sub_registry = getattr(util.registry, reg_name)
        for func_name in func_names:
            func_info = sub_registry.find(func_name)
            module_name = func_info.get("module")
            if module_name:  # the code is part of a module, not a --code file
                modules.add(module_name.split(".")[0])
    dependencies = []
    seen = set()  # distributions already added, to avoid duplicate requirements
    # Sets have no stable iteration order, so sort to keep the output stable.
    for module_name in sorted(modules):
        dists = distributions.get(module_name)
        if not dists:  # module isn't provided by an installed distribution
            continue
        pkg = dists[0]
        if pkg in own_packages or pkg in exclude or pkg in seen:
            continue
        seen.add(pkg)
        version = util.get_package_version(pkg)
        version_range = util.get_minor_version_range(version)
        dependencies.append(f"{pkg}{version_range}")
    return dependencies
def get_build_formats(formats: List[str]) -> Tuple[bool, bool]:
supported = ["sdist", "wheel", "none"]
for form in formats:
@ -208,7 +261,7 @@ def get_meta(
nlp = util.load_model_from_path(Path(model_path))
meta.update(nlp.meta)
meta.update(existing_meta)
meta["spacy_version"] = util.get_model_version_range(about.__version__)
meta["spacy_version"] = util.get_minor_version_range(about.__version__)
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),
@ -217,6 +270,11 @@ def get_meta(
}
if about.__title__ != "spacy":
meta["parent_package"] = about.__title__
meta.setdefault("requirements", [])
# Update the requirements with all third-party packages in the config
existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]]
reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs)
meta["requirements"].extend(reqs)
return meta

View File

@ -27,6 +27,14 @@ try: # Python 3.8+
except ImportError:
from typing_extensions import Literal # noqa: F401
# Important note: The importlib_metadata "backport" includes functionality
# that's not part of the built-in importlib.metadata. We should treat this
# import like the built-in and only use what's available there.
try: # Python 3.8+
import importlib.metadata as importlib_metadata
except ImportError:
from catalogue import _importlib_metadata as importlib_metadata # noqa: F401
from thinc.api import Optimizer # noqa: F401
pickle = pickle

View File

@ -199,7 +199,7 @@ class Language:
DOCS: https://spacy.io/api/language#meta
"""
spacy_version = util.get_model_version_range(about.__version__)
spacy_version = util.get_minor_version_range(about.__version__)
if self.vocab.lang:
self._meta.setdefault("lang", self.vocab.lang)
else:

View File

@ -14,6 +14,7 @@ from spacy import about
from spacy.util import get_minor_version
from spacy.cli.validate import get_model_pkgs
from spacy.cli.download import get_compatibility, get_version
from spacy.cli.package import get_third_party_dependencies
from thinc.api import ConfigValidationError, Config
import srsly
import os
@ -532,3 +533,10 @@ def test_init_labels(component_name):
assert len(nlp2.get_pipe(component_name).labels) == 0
nlp2.initialize()
assert len(nlp2.get_pipe(component_name).labels) == 4
def test_get_third_party_dependencies_runs():
    # Detecting actual third-party packages is hard to set up in a test, so
    # this is a smoke test: the function and its importlib-based lookup should
    # run on a blank pipeline's config and report no extra dependencies.
    config = Dutch().config
    dependencies = get_third_party_dependencies(config)
    assert dependencies == []

View File

@ -20,8 +20,10 @@ import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion
from packaging.requirements import Requirement
import subprocess
from contextlib import contextmanager
from collections import defaultdict
import tempfile
import shutil
import shlex
@ -33,11 +35,6 @@ try:
except ImportError:
cupy = None
try: # Python 3.8
import importlib.metadata as importlib_metadata
except ImportError:
from catalogue import _importlib_metadata as importlib_metadata
# These are functions that were previously (v2.x) available from spacy.util
# and have since moved to Thinc. We're importing them here so people's code
# doesn't break, but they should always be imported from Thinc from now on,
@ -46,7 +43,7 @@ from thinc.api import fix_random_seed, compounding, decaying # noqa: F401
from .symbols import ORTH
from .compat import cupy, CudaStream, is_windows
from .compat import cupy, CudaStream, is_windows, importlib_metadata
from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
from . import about
@ -639,13 +636,18 @@ def is_unconstrained_version(
return True
def get_model_version_range(spacy_version: str) -> str:
"""Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy
version. Models are always compatible across patch versions but not
across minor or major versions.
def split_requirement(requirement: str) -> Tuple[str, str]:
    """Split a requirement string into its package name and version
    specifier, e.g. spacy>=1.2.3 becomes ("spacy", ">=1.2.3").

    requirement (str): The requirement string to parse.
    RETURNS (Tuple[str, str]): The package name and the stringified specifier.
    """
    parsed = Requirement(requirement)
    name = parsed.name
    specifier = str(parsed.specifier)
    return name, specifier
def get_minor_version_range(version: str) -> str:
    """Generate a version range like >=1.2.3,<1.3.0 based on a given version
    (e.g. of spaCy).

    version (str): The version to base the range on, e.g. "1.2.3".
    RETURNS (str): A specifier that allows the given version up to, but not
        including, the next minor release.
    """
    release = Version(version).release
    major = release[0]
    # A version with a single release segment (e.g. "3") has no explicit minor
    # component, so treat it as 0 instead of failing on the index lookup.
    minor = release[1] if len(release) > 1 else 0
    return f">={version},<{major}.{minor + 1}.0"
def get_model_lower_version(constraint: str) -> Optional[str]:
@ -733,7 +735,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
model=f"{meta['lang']}_{meta['name']}",
model_version=meta["version"],
version=meta["spacy_version"],
example=get_model_version_range(about.__version__),
example=get_minor_version_range(about.__version__),
)
warnings.warn(warn_msg)
return meta
@ -1549,3 +1551,19 @@ def to_ternary_int(val) -> int:
return 0
else:
return -1
# The following implementation of packages_distributions() is adapted from
# importlib_metadata, which is distributed under the Apache 2.0 License.
# Copyright (c) 2017-2019 Jason R. Coombs, Barry Warsaw
# See licenses/3rd_party_licenses.txt
def packages_distributions() -> Dict[str, List[str]]:
    """Map every top-level package name to the names of the distributions
    that list it in their top_level.txt. This helper is inlined from the
    importlib_metadata "backport", since it's not available in the builtin
    importlib.metadata.

    RETURNS (Dict[str, List[str]]): Distribution names keyed by top-level
        package name.
    """
    mapping: Dict[str, List[str]] = {}
    for dist in importlib_metadata.distributions():
        # read_text gives nothing usable for distributions without a
        # top_level.txt, in which case there are no packages to record.
        top_level = dist.read_text("top_level.txt") or ""
        for module in top_level.split():
            mapping.setdefault(module, []).append(dist.metadata["Name"])
    return mapping