import pytest

try:
    from pydantic.v1 import StrictInt, StrictStr
except ImportError:
    from pydantic import StrictInt, StrictStr  # type: ignore

from thinc.api import ConfigValidationError, Linear, Model

import spacy
from spacy.lang.de import German
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.tokens import Doc
from spacy.util import SimpleFrozenDict, combine_score_weights, registry

from ..util import make_tempdir


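# Regression test for issue #5137: config overrides passed to spacy.load should
# be applied to custom components when the pipeline is deserialized.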
@pytest.mark.issue(5137)
def test_issue5137():
    factory_name = "test_issue5137"
    pipe_name = "my_component"

    @Language.factory(factory_name)
    class MyComponent:
        def __init__(self, nlp, name=pipe_name, categories="all_categories"):
            self.nlp = nlp
            self.categories = categories
            self.name = name

        def __call__(self, doc):
            pass

        def to_disk(self, path, **kwargs):
            pass

        def from_disk(self, path, **cfg):
            pass

    nlp = English()
    my_component = nlp.add_pipe(factory_name, name=pipe_name)
    assert my_component.categories == "all_categories"
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        overrides = {"components": {pipe_name: {"categories": "my_categories"}}}
        nlp2 = spacy.load(tmpdir, config=overrides)
        assert nlp2.get_pipe(pipe_name).categories == "my_categories"


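# Function components registered with @Language.component are added to the
# pipeline by their string name; passing the function object itself is an error.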
def test_pipe_function_component():
    name = "test_component"

    @Language.component(name)
    def component(doc: Doc) -> Doc:
        return doc

    assert name in registry.factories
    nlp = Language()
    with pytest.raises(ValueError):
        nlp.add_pipe(component)
    nlp.add_pipe(name)
    assert name in nlp.pipe_names
    assert nlp.pipe_factories[name] == name
    assert Language.get_factory_meta(name)
    assert nlp.get_pipe_meta(name)
    pipe = nlp.get_pipe(name)
    assert pipe == component
    pipe = nlp.create_pipe(name)
    assert pipe == component


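# Class-based components must be registered via @Language.factory (either on the
# class itself or via a factory function) and are added by name, not as instances.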
def test_pipe_class_component_init():
    name1 = "test_class_component1"
    name2 = "test_class_component2"

    @Language.factory(name1)
    class Component1:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    class Component2:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @Language.factory(name2)
    def factory(nlp: Language, name=name2):
        return Component2(nlp, name)

    nlp = Language()
    for name, Component in [(name1, Component1), (name2, Component2)]:
        assert name in registry.factories
        with pytest.raises(ValueError):
            nlp.add_pipe(Component(nlp, name))
        nlp.add_pipe(name)
        assert name in nlp.pipe_names
        assert nlp.pipe_factories[name] == name
        assert Language.get_factory_meta(name)
        assert nlp.get_pipe_meta(name)
        pipe = nlp.get_pipe(name)
        assert isinstance(pipe, Component)
        assert isinstance(pipe.nlp, Language)
        pipe = nlp.create_pipe(name)
        assert isinstance(pipe, Component)
        assert isinstance(pipe.nlp, Language)


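# Config values are validated against the strict pydantic types in the factory
# signature, and a subclass factory (English) can shadow the base Language factory
# registered under the same name.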
def test_pipe_class_component_config():
    name = "test_class_component_config"

    @Language.factory(name)
    class Component:
        def __init__(
            self, nlp: Language, name: str, value1: StrictInt, value2: StrictStr
        ):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = True
            self.name = name

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @English.factory(name)
    class ComponentEN:
        def __init__(
            self, nlp: Language, name: str, value1: StrictInt, value2: StrictStr
        ):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = False

        def __call__(self, doc: Doc) -> Doc:
            return doc

    nlp = Language()
    with pytest.raises(ConfigValidationError):  # no config provided
        nlp.add_pipe(name)
    with pytest.raises(ConfigValidationError):  # invalid config
        nlp.add_pipe(name, config={"value1": "10", "value2": "hello"})
    with pytest.warns(UserWarning):
        nlp.add_pipe(
            name, config={"value1": 10, "value2": "hello", "name": "wrong_name"}
        )
    pipe = nlp.get_pipe(name)
    assert isinstance(pipe.nlp, Language)
    assert pipe.value1 == 10
    assert pipe.value2 == "hello"
    assert pipe.is_base is True
    assert pipe.name == name

    nlp_en = English()
    with pytest.raises(ConfigValidationError):  # invalid config
        nlp_en.add_pipe(name, config={"value1": "10", "value2": "hello"})
    nlp_en.add_pipe(name, config={"value1": 10, "value2": "hello"})
    pipe = nlp_en.get_pipe(name)
    assert isinstance(pipe.nlp, English)
    assert pipe.value1 == 10
    assert pipe.value2 == "hello"
    assert pipe.is_base is False


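# Default config values are read from the factory signature (strict pydantic
# defaults) when no config is provided.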
def test_pipe_class_component_defaults():
    name = "test_class_component_defaults"

    @Language.factory(name)
    class Component:
        def __init__(
            self,
            nlp: Language,
            name: str,
            value1: StrictInt = StrictInt(10),
            value2: StrictStr = StrictStr("hello"),
        ):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2

        def __call__(self, doc: Doc) -> Doc:
            return doc

    nlp = Language()
    nlp.add_pipe(name)
    pipe = nlp.get_pipe(name)
    assert isinstance(pipe.nlp, Language)
    assert pipe.value1 == 10
    assert pipe.value2 == "hello"


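# A "model" block in the default config is resolved into a thinc Model instance
# before the component is constructed.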
def test_pipe_class_component_model():
    name = "test_class_component_model"
    default_config = {
        "model": {
            "@architectures": "spacy.TextCatEnsemble.v2",
            "tok2vec": DEFAULT_TOK2VEC_MODEL,
            "linear_model": {
                "@architectures": "spacy.TextCatBOW.v3",
                "exclusive_classes": False,
                "ngram_size": 1,
                "no_output_layer": False,
            },
        },
        "value1": 10,
    }

    @Language.factory(name, default_config=default_config)
    class Component:
        def __init__(self, nlp: Language, model: Model, name: str, value1: StrictInt):
            self.nlp = nlp
            self.model = model
            self.value1 = value1
            self.name = name

        def __call__(self, doc: Doc) -> Doc:
            return doc

    nlp = Language()
    nlp.add_pipe(name)
    pipe = nlp.get_pipe(name)
    assert isinstance(pipe.nlp, Language)
    assert pipe.value1 == 10
    assert isinstance(pipe.model, Model)


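# Custom architectures registered via registry.architectures can be referenced
# from the default config, and their arguments are validated as well.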
def test_pipe_class_component_model_custom():
    name = "test_class_component_model_custom"
    arch = f"{name}.arch"
    default_config = {"value1": 1, "model": {"@architectures": arch, "nO": 0, "nI": 0}}

    @Language.factory(name, default_config=default_config)
    class Component:
        def __init__(
            self,
            nlp: Language,
            model: Model,
            name: str,
            value1: StrictInt = StrictInt(10),
        ):
            self.nlp = nlp
            self.model = model
            self.value1 = value1
            self.name = name

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @registry.architectures(arch)
    def make_custom_arch(nO: StrictInt, nI: StrictInt):
        return Linear(nO, nI)

    nlp = Language()
    config = {"value1": 20, "model": {"@architectures": arch, "nO": 1, "nI": 2}}
    nlp.add_pipe(name, config=config)
    pipe = nlp.get_pipe(name)
    assert isinstance(pipe.nlp, Language)
    assert pipe.value1 == 20
    assert isinstance(pipe.model, Model)
    assert pipe.model.name == "linear"

    nlp = Language()
    with pytest.raises(ConfigValidationError):
        config = {"value1": "20", "model": {"@architectures": arch, "nO": 1, "nI": 2}}
        nlp.add_pipe(name, config=config)
    with pytest.raises(ConfigValidationError):
        config = {"value1": 20, "model": {"@architectures": arch, "nO": 1.0, "nI": 2.0}}
        nlp.add_pipe(name, config=config)


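# Misusing the registration decorators (calling them without parentheses, or
# registering a factory whose function lacks the "nlp" and "name" arguments)
# raises ValueError.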
def test_pipe_factories_wrong_formats():
    with pytest.raises(ValueError):
        # Decorator is not called
        @Language.component
        def component(foo: int, bar: str):
            ...

    with pytest.raises(ValueError):
        # Decorator is not called
        @Language.factory
        def factory1(foo: int, bar: str):
            ...

    with pytest.raises(ValueError):
        # Factory function is missing "nlp" and "name" arguments
        @Language.factory("test_pipe_factories_missing_args")
        def factory2(foo: int, bar: str):
            ...


def test_pipe_factory_meta_config_cleanup():
    """Test that component-specific meta and config entries are represented
    correctly and cleaned up when pipes are removed, replaced or renamed."""
    nlp = Language()
    nlp.add_pipe("ner", name="ner_component")
    nlp.add_pipe("textcat")
    assert nlp.get_factory_meta("ner")
    assert nlp.get_pipe_meta("ner_component")
    assert nlp.get_pipe_config("ner_component")
    assert nlp.get_factory_meta("textcat")
    assert nlp.get_pipe_meta("textcat")
    assert nlp.get_pipe_config("textcat")
    nlp.rename_pipe("textcat", "tc")
    assert nlp.get_pipe_meta("tc")
    assert nlp.get_pipe_config("tc")
    with pytest.raises(ValueError):
        nlp.remove_pipe("ner")
    nlp.remove_pipe("ner_component")
    assert "ner_component" not in nlp._pipe_meta
    assert "ner_component" not in nlp._pipe_configs
    with pytest.raises(ValueError):
        nlp.replace_pipe("textcat", "parser")
    nlp.replace_pipe("tc", "parser")
    assert nlp.get_factory_meta("parser")
    assert nlp.get_pipe_meta("tc").factory == "parser"


def test_pipe_factories_empty_dict_default():
    """Test that default config values can be empty dicts and that no config
    validation error is raised."""
    # TODO: fix this
    name = "test_pipe_factories_empty_dict_default"

    @Language.factory(name, default_config={"foo": {}})
    def factory(nlp: Language, name: str, foo: dict):
        ...

    nlp = Language()
    nlp.create_pipe(name)


def test_pipe_factories_language_specific():
    """Test that language sub-classes can have their own factories, with
    fallbacks to the base factories."""
    name1 = "specific_component1"
    name2 = "specific_component2"
    Language.component(name1, func=lambda: "base")
    English.component(name1, func=lambda: "en")
    German.component(name2, func=lambda: "de")

    assert Language.has_factory(name1)
    assert not Language.has_factory(name2)
    assert English.has_factory(name1)
    assert not English.has_factory(name2)
    assert German.has_factory(name1)
    assert German.has_factory(name2)

    nlp = Language()
    assert nlp.create_pipe(name1)() == "base"
    with pytest.raises(ValueError):
        nlp.create_pipe(name2)
    nlp_en = English()
    assert nlp_en.create_pipe(name1)() == "en"
    with pytest.raises(ValueError):
        nlp_en.create_pipe(name2)
    nlp_de = German()
    assert nlp_de.create_pipe(name1)() == "base"
    assert nlp_de.create_pipe(name2)() == "de"


def test_language_factories_invalid():
    """Test that assigning directly to Language.factories is now invalid and
    raises a custom error."""
    assert isinstance(Language.factories, SimpleFrozenDict)
    with pytest.raises(NotImplementedError):
        Language.factories["foo"] = "bar"
    nlp = Language()
    assert isinstance(nlp.factories, SimpleFrozenDict)
    assert len(nlp.factories)
    with pytest.raises(NotImplementedError):
        nlp.factories["foo"] = "bar"


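# Each case lists the per-component score weights, the user overrides, and the
# expected normalized combination returned by combine_score_weights.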
@pytest.mark.parametrize(
    "weights,override,expected",
    [
        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}),
        ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}),
        (
            [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
            {},
            {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
        ),
        (
            [{"a": 100, "b": 300}, {"c": 50, "d": 50}],
            {},
            {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1},
        ),
        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}),
        ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}),
        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
        ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
        (
            [{"a": 0.0, "b": 0.0}, {"c": 0.0}],
            {"c": 0.2},
            {"a": 0.0, "b": 0.0, "c": 1.0},
        ),
        (
            [{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}],
            {"a": 0.0, "b": 0.0},
            {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5},
        ),
        (
            [{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}],
            {"a": 0.0, "b": 0.0, "f": 0.0},
            {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5, "f": 0.0},
        ),
    ],
)
def test_language_factories_combine_score_weights(weights, override, expected):
    result = combine_score_weights(weights, override)
    assert sum(result.values()) in (0.99, 1.0, 0.0)
    assert result == expected


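# default_score_weights declared by each factory are combined and normalized
# into the training.score_weights config, respecting user overrides and null
# values.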
def test_language_factories_scores():
    name = "test_language_factories_scores"
    func = lambda nlp, name: lambda doc: doc
    weights1 = {"a1": 0.5, "a2": 0.5}
    weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
    Language.factory(f"{name}1", default_score_weights=weights1, func=func)
    Language.factory(f"{name}2", default_score_weights=weights2, func=func)
    meta1 = Language.get_factory_meta(f"{name}1")
    assert meta1.default_score_weights == weights1
    meta2 = Language.get_factory_meta(f"{name}2")
    assert meta2.default_score_weights == weights2
    nlp = Language()
    nlp._config["training"]["score_weights"] = {}
    nlp.add_pipe(f"{name}1")
    nlp.add_pipe(f"{name}2")
    cfg = nlp.config["training"]
    expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
    assert cfg["score_weights"] == expected_weights
    # Test with custom defaults
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = 0.0
    config["training"]["score_weights"]["b3"] = 1.3
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
    expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
    assert score_weights == expected
    # Test with null values
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = None
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
    expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
    assert score_weights == expected


def test_pipe_factories_from_source():
    """Test adding components from a source model."""
    source_nlp = English()
    source_nlp.add_pipe("tagger", name="my_tagger")
    nlp = English()
    with pytest.raises(ValueError):
        nlp.add_pipe("my_tagger", source="en_core_web_sm")
    nlp.add_pipe("my_tagger", source=source_nlp)
    assert "my_tagger" in nlp.pipe_names
    with pytest.raises(KeyError):
        nlp.add_pipe("custom", source=source_nlp)


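# Sourcing a component into a Language subclass or another language works;
# mismatched vectors between source and destination only trigger a warning.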
def test_pipe_factories_from_source_language_subclass():
    class CustomEnglishDefaults(English.Defaults):
        stop_words = set(["custom", "stop"])

    @registry.languages("custom_en")
    class CustomEnglish(English):
        lang = "custom_en"
        Defaults = CustomEnglishDefaults

    source_nlp = English()
    source_nlp.add_pipe("tagger")

    # custom subclass
    nlp = CustomEnglish()
    nlp.add_pipe("tagger", source=source_nlp)
    assert "tagger" in nlp.pipe_names

    # non-subclass
    nlp = German()
    nlp.add_pipe("tagger", source=source_nlp)
    assert "tagger" in nlp.pipe_names

    # mismatched vectors
    nlp = English()
    nlp.vocab.vectors.resize((1, 4))
    nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
    with pytest.warns(UserWarning):
        nlp.add_pipe("tagger", source=source_nlp)


def test_pipe_factories_from_source_custom():
    """Test adding components from a source model with custom components."""
    name = "test_pipe_factories_from_source_custom"

    @Language.factory(name, default_config={"arg": "hello"})
    def test_factory(nlp, name, arg: str):
        return lambda doc: doc

    source_nlp = English()
    source_nlp.add_pipe("tagger")
    source_nlp.add_pipe(name, config={"arg": "world"})
    nlp = English()
    nlp.add_pipe(name, source=source_nlp)
    assert name in nlp.pipe_names
    assert nlp.get_pipe_meta(name).default_config["arg"] == "hello"
    config = nlp.config["components"][name]
    assert config["factory"] == name
    assert config["arg"] == "world"


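# Components can also be sourced through the config, using the "source" and
# "component" keys of a component block.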
def test_pipe_factories_from_source_config():
    name = "test_pipe_factories_from_source_config"

    @Language.factory(name, default_config={"arg": "hello"})
    def test_factory(nlp, name, arg: str):
        return lambda doc: doc

    source_nlp = English()
    source_nlp.add_pipe("tagger")
    source_nlp.add_pipe(name, name="yolo", config={"arg": "world"})
    dest_nlp_cfg = {"lang": "en", "pipeline": ["parser", "custom"]}
    with make_tempdir() as tempdir:
        source_nlp.to_disk(tempdir)
        dest_components_cfg = {
            "parser": {"factory": "parser"},
            "custom": {"source": str(tempdir), "component": "yolo"},
        }
        dest_config = {"nlp": dest_nlp_cfg, "components": dest_components_cfg}
        nlp = English.from_config(dest_config)
    assert nlp.pipe_names == ["parser", "custom"]
    assert nlp.pipe_factories == {"parser": "parser", "custom": name}
    meta = nlp.get_pipe_meta("custom")
    assert meta.factory == name
    assert meta.default_config["arg"] == "hello"
    config = nlp.config["components"]["custom"]
    assert config["factory"] == name
    assert config["arg"] == "world"


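# Minimal no-op component class used to check that the factory and component
# decorators can be applied repeatedly with the same function or class.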
class PipeFactoriesIdempotent:
    def __init__(self, nlp, name):
        ...

    def __call__(self, doc):
        ...


@pytest.mark.parametrize(
    "i,func,func2",
    [
        (0, lambda nlp, name: lambda doc: doc, lambda doc: doc),
        (1, PipeFactoriesIdempotent, PipeFactoriesIdempotent(None, None)),
    ],
)
def test_pipe_factories_decorator_idempotent(i, func, func2):
    """Check that decorator can be run multiple times if the function is the
    same. This is especially relevant for live reloading because we don't
    want spaCy to raise an error if a module registering components is reloaded.
    """
    name = f"test_pipe_factories_decorator_idempotent_{i}"
    for i in range(5):
        Language.factory(name, func=func)
    nlp = Language()
    nlp.add_pipe(name)
    Language.factory(name, func=func)
    # Make sure it also works for the component decorator, which creates the
    # factory function
    name2 = f"{name}2"
    for i in range(5):
        Language.component(name2, func=func2)
    nlp = Language()
    nlp.add_pipe(name2)
    Language.component(name2, func=func2)


def test_pipe_factories_config_excludes_nlp():
    """Test that the extra values we temporarily add to component config
    blocks/functions are removed and not copied around.
    """
    name = "test_pipe_factories_config_excludes_nlp"
    func = lambda nlp, name: lambda doc: doc
    Language.factory(name, func=func)
    config = {
        "nlp": {"lang": "en", "pipeline": [name]},
        "components": {name: {"factory": name}},
    }
    nlp = English.from_config(config)
    assert nlp.pipe_names == [name]
    pipe_cfg = nlp.get_pipe_config(name)
    assert pipe_cfg == {"factory": name}
    assert nlp._pipe_configs[name] == {"factory": name}