Raise error in spacy package when model name is not a valid python identifier (#10192)

* MultiHashEmbed vector docs correction

* raise error for invalid identifier as model name

* more succinct error message

* update success message

* permitted package name + double underscore

* clarify package name error

* clarify underscore run message

* tweak language + simplify underscore run

* cleanup underscore run warning

* spacing correction

* Update spacy/tests/test_cli.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
Peter Baumgartner 2022-02-10 02:15:23 -05:00 committed by GitHub
parent 3877f78ff9
commit ee662ec381
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 48 additions and 3 deletions

View File

@ -7,6 +7,7 @@ from collections import defaultdict
from catalogue import RegistryError
import srsly
import sys
import re
from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
from ..schemas import validate, ModelMetaSchema
@ -109,6 +110,24 @@ def package(
", ".join(meta["requirements"]),
)
if name is not None:
if not name.isidentifier():
msg.fail(
f"Model name ('{name}') is not a valid module name. "
"This is required so it can be imported as a module.",
"We recommend names that use ASCII A-Z, a-z, _ (underscore), "
"and 0-9. "
"For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
exits=1,
)
if not _is_permitted_package_name(name):
msg.fail(
f"Model name ('{name}') is not a permitted package name. "
"This is required to correctly load the model with spacy.load.",
"We recommend names that use ASCII A-Z, a-z, _ (underscore), "
"and 0-9. "
"For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
exits=1,
)
meta["name"] = name
if version is not None:
meta["version"] = version
@ -162,7 +181,7 @@ def package(
imports="\n".join(f"from . import {m}" for m in imports)
)
create_file(package_path / "__init__.py", init_py)
msg.good(f"Successfully created package '{model_name_v}'", main_path)
msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
if create_sdist:
with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
@ -171,8 +190,14 @@ def package(
if create_wheel:
with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
wheel_name_squashed = re.sub("_+", "_", model_name_v)
wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
msg.good(f"Successfully created binary wheel", wheel)
if "__" in model_name:
msg.warn(
f"Model name ('{model_name}') contains a run of underscores. "
"Runs of underscores are not significant in installed package names.",
)
def has_wheel() -> bool:
@ -422,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
return md.text
def _is_permitted_package_name(package_name: str) -> bool:
# regex from: https://www.python.org/dev/peps/pep-0426/#name
permitted_match = re.search(
r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
)
return permitted_match is not None
TEMPLATE_SETUP = """
#!/usr/bin/env python
import io

View File

@ -17,6 +17,7 @@ from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name
from spacy.cli.validate import get_model_pkgs
from spacy.lang.en import English
from spacy.lang.nl import Dutch
@ -695,6 +696,17 @@ def test_get_labels_from_model(factory_name, pipe_name):
assert _get_labels_from_model(nlp, factory_name) == set(labels)
def test_permitted_package_names():
# https://www.python.org/dev/peps/pep-0426/#name
assert _is_permitted_package_name("Meine_Bäume") == False
assert _is_permitted_package_name("_package") == False
assert _is_permitted_package_name("package_") == False
assert _is_permitted_package_name(".package") == False
assert _is_permitted_package_name("package.") == False
assert _is_permitted_package_name("-package") == False
assert _is_permitted_package_name("package-") == False
def test_debug_data_compile_gold():
nlp = English()
pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
@ -707,4 +719,4 @@ def test_debug_data_compile_gold():
ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"])
eg = Example(pred, ref)
data = _compile_gold([eg], ["ner"], nlp, True)
assert data["boundary_cross_ents"] == 1
assert data["boundary_cross_ents"] == 1