From d94ddd568691cfa9d56b353173237afe1f328b43 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Aug 2021 22:05:13 +1000 Subject: [PATCH] Auto-detect package dependencies in spacy package (#8948) * Auto-detect package dependencies in spacy package * Add simple get_third_party_dependencies test * Import packages_distributions explicitly * Inline packages_distributions * Fix docstring [ci skip] * Relax catalogue requirement * Move importlib_metadata to spacy.compat with note * Include license information [ci skip] --- licenses/3rd_party_licenses.txt | 23 +++++++++++++ spacy/cli/package.py | 60 ++++++++++++++++++++++++++++++++- spacy/compat.py | 8 +++++ spacy/language.py | 2 +- spacy/tests/test_cli.py | 8 +++++ spacy/util.py | 44 +++++++++++++++++------- 6 files changed, 130 insertions(+), 15 deletions(-) diff --git a/licenses/3rd_party_licenses.txt b/licenses/3rd_party_licenses.txt index 7bc3d4547..d58da9c4a 100644 --- a/licenses/3rd_party_licenses.txt +++ b/licenses/3rd_party_licenses.txt @@ -104,3 +104,26 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +importlib_metadata +------------------ + +* Files: util.py + +The implementation of packages_distributions() is adapted from +importlib_metadata, which is distributed under the following license: + +Copyright 2017-2019 Jason R. Coombs, Barry Warsaw + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 342baa8ab..b6b993267 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -2,6 +2,8 @@ from typing import Optional, Union, Any, Dict, List, Tuple import shutil from pathlib import Path from wasabi import Printer, MarkdownRenderer, get_raw_input +from thinc.api import Config +from collections import defaultdict import srsly import sys @@ -99,6 +101,12 @@ def package( msg.fail("Can't load pipeline meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) meta = get_meta(input_dir, meta) + if meta["requirements"]: + msg.good( + f"Including {len(meta['requirements'])} package requirement(s) from " + f"meta and config", + ", ".join(meta["requirements"]), + ) if name is not None: meta["name"] = name if version is not None: @@ -175,6 +183,51 @@ def has_wheel() -> bool: return False +def get_third_party_dependencies( + config: Config, exclude: List[str] = util.SimpleFrozenList() +) -> List[str]: + """If the config includes references to registered functions that are + provided by third-party packages (spacy-transformers, other libraries), we + want to include them in meta["requirements"] so that the package specifies + them as dependencies and the user won't have to do it manually. + + We do this by: + - traversing the config to check for registered function (@ keys) + - looking up the functions and getting their module + - looking up the module version and generating an appropriate version range + + config (Config): The pipeline config. + exclude (list): List of packages to exclude (e.g. that already exist in meta). + RETURNS (list): The versioned requirements. + """ + own_packages = ("spacy", "spacy-nightly", "thinc", "srsly") + distributions = util.packages_distributions() + funcs = defaultdict(set) + for path, value in util.walk_dict(config): + if path[-1].startswith("@"): # collect all function references by registry + funcs[path[-1][1:]].add(value) + modules = set() + for reg_name, func_names in funcs.items(): + sub_registry = getattr(util.registry, reg_name) + for func_name in func_names: + func_info = sub_registry.find(func_name) + module_name = func_info.get("module") + if module_name: # the code is part of a module, not a --code file + modules.add(func_info["module"].split(".")[0]) + dependencies = [] + for module_name in modules: + if module_name in distributions: + dist = distributions.get(module_name) + if dist: + pkg = dist[0] + if pkg in own_packages or pkg in exclude: + continue + version = util.get_package_version(pkg) + version_range = util.get_minor_version_range(version) + dependencies.append(f"{pkg}{version_range}") + return dependencies + + def get_build_formats(formats: List[str]) -> Tuple[bool, bool]: supported = ["sdist", "wheel", "none"] for form in formats: @@ -208,7 +261,7 @@ def get_meta( nlp = util.load_model_from_path(Path(model_path)) meta.update(nlp.meta) meta.update(existing_meta) - meta["spacy_version"] = util.get_model_version_range(about.__version__) + meta["spacy_version"] = util.get_minor_version_range(about.__version__) meta["vectors"] = { "width": nlp.vocab.vectors_length, "vectors": len(nlp.vocab.vectors), @@ -217,6 +270,11 @@ def get_meta( } if about.__title__ != "spacy": meta["parent_package"] = about.__title__ + meta.setdefault("requirements", []) + # Update the requirements with all third-party packages in the config + existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]] + reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs) + meta["requirements"].extend(reqs) return meta diff --git a/spacy/compat.py b/spacy/compat.py index 6eca18b80..92ed23c0e 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -27,6 +27,14 @@ try: # Python 3.8+ except ImportError: from typing_extensions import Literal # noqa: F401 +# Important note: The importlib_metadata "backport" includes functionality +# that's not part of the built-in importlib.metadata. We should treat this +# import like the built-in and only use what's available there. +try: # Python 3.8+ + import importlib.metadata as importlib_metadata +except ImportError: + from catalogue import _importlib_metadata as importlib_metadata # noqa: F401 + from thinc.api import Optimizer # noqa: F401 pickle = pickle diff --git a/spacy/language.py b/spacy/language.py index a8cad1259..99d55df81 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -199,7 +199,7 @@ class Language: DOCS: https://spacy.io/api/language#meta """ - spacy_version = util.get_model_version_range(about.__version__) + spacy_version = util.get_minor_version_range(about.__version__) if self.vocab.lang: self._meta.setdefault("lang", self.vocab.lang) else: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 6f0fdcfa5..1841de317 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -14,6 +14,7 @@ from spacy import about from spacy.util import get_minor_version from spacy.cli.validate import get_model_pkgs from spacy.cli.download import get_compatibility, get_version +from spacy.cli.package import get_third_party_dependencies from thinc.api import ConfigValidationError, Config import srsly import os @@ -532,3 +533,10 @@ def test_init_labels(component_name): assert len(nlp2.get_pipe(component_name).labels) == 0 nlp2.initialize() assert len(nlp2.get_pipe(component_name).labels) == 4 + + +def test_get_third_party_dependencies_runs(): + # We can't easily test the detection of third-party packages here, but we + # can at least make sure that the function and its importlib magic runs. + nlp = Dutch() + assert get_third_party_dependencies(nlp.config) == [] diff --git a/spacy/util.py b/spacy/util.py index 421287ce2..6638e94ce 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -20,8 +20,10 @@ import sys import warnings from packaging.specifiers import SpecifierSet, InvalidSpecifier from packaging.version import Version, InvalidVersion +from packaging.requirements import Requirement import subprocess from contextlib import contextmanager +from collections import defaultdict import tempfile import shutil import shlex @@ -33,11 +35,6 @@ try: except ImportError: cupy = None -try: # Python 3.8 - import importlib.metadata as importlib_metadata -except ImportError: - from catalogue import _importlib_metadata as importlib_metadata - # These are functions that were previously (v2.x) available from spacy.util # and have since moved to Thinc. We're importing them here so people's code # doesn't break, but they should always be imported from Thinc from now on, @@ -46,7 +43,7 @@ from thinc.api import fix_random_seed, compounding, decaying # noqa: F401 from .symbols import ORTH -from .compat import cupy, CudaStream, is_windows +from .compat import cupy, CudaStream, is_windows, importlib_metadata from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS from . import about @@ -639,13 +636,18 @@ def is_unconstrained_version( return True -def get_model_version_range(spacy_version: str) -> str: - """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy - version. Models are always compatible across patch versions but not - across minor or major versions. +def split_requirement(requirement: str) -> Tuple[str, str]: + """Split a requirement like spacy>=1.2.3 into ("spacy", ">=1.2.3").""" + req = Requirement(requirement) + return (req.name, str(req.specifier)) + + +def get_minor_version_range(version: str) -> str: + """Generate a version range like >=1.2.3,<1.3.0 based on a given version + (e.g. of spaCy). """ - release = Version(spacy_version).release - return f">={spacy_version},<{release[0]}.{release[1] + 1}.0" + release = Version(version).release + return f">={version},<{release[0]}.{release[1] + 1}.0" def get_model_lower_version(constraint: str) -> Optional[str]: @@ -733,7 +735,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]: model=f"{meta['lang']}_{meta['name']}", model_version=meta["version"], version=meta["spacy_version"], - example=get_model_version_range(about.__version__), + example=get_minor_version_range(about.__version__), ) warnings.warn(warn_msg) return meta @@ -1549,3 +1551,19 @@ def to_ternary_int(val) -> int: return 0 else: return -1 + + +# The following implementation of packages_distributions() is adapted from +# importlib_metadata, which is distributed under the Apache 2.0 License. +# Copyright (c) 2017-2019 Jason R. Coombs, Barry Warsaw +# See licenses/3rd_party_licenses.txt +def packages_distributions() -> Dict[str, List[str]]: + """Return a mapping of top-level packages to their distributions. We're + inlining this helper from the importlib_metadata "backport" here, since + it's not available in the builtin importlib.metadata. + """ + pkg_to_dist = defaultdict(list) + for dist in importlib_metadata.distributions(): + for pkg in (dist.read_text("top_level.txt") or "").split(): + pkg_to_dist[pkg].append(dist.metadata["Name"]) + return dict(pkg_to_dist)