Tidy up docstrings and arguments

This commit is contained in:
Ines Montani 2020-07-28 23:12:42 +02:00
parent 256b24b720
commit e5d9eaf79c
1 changed files with 82 additions and 23 deletions

View File

@ -36,6 +36,7 @@ from . import util
from . import about
# TODO: integrate pipeline analyis
ENABLE_PIPELINE_ANALYSIS = False
# This is the base config will all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
@ -43,6 +44,10 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
class BaseDefaults:
"""Language data defaults, available via Language.Defaults. Can be
overwritten by language subclasses by defining their own subclasses of
Language.Defaults.
"""
config: Config = Config()
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
@ -58,6 +63,9 @@ class BaseDefaults:
@registry.tokenizers("spacy.Tokenizer.v1")
def create_tokenizer() -> Callable[["Language"], Tokenizer]:
"""Registered function to create a tokenizer. Returns a factory that takes
the nlp object and returns a Tokenizer instance using the language detaults.
"""
def tokenizer_factory(nlp: "Language") -> Tokenizer:
prefixes = nlp.Defaults.prefixes
suffixes = nlp.Defaults.suffixes
@ -80,6 +88,11 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
@registry.lemmatizers("spacy.Lemmatizer.v1")
def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]:
"""Registered function to create a lemmatizer. Returns a factory that takes
the nlp object and returns a Lemmatizer instance with data loaded in from
spacy-lookups-data, if the package is installed.
"""
# TODO: Will be replaced when the lemmatizer becomes a pipeline component
tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
def lemmatizer_factory(nlp: "Language") -> "Lemmatizer":
@ -116,7 +129,7 @@ class Language:
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
create_lemmatizer: Optional[Callable[["Language"], Callable]] = None,
**kwargs,
):
) -> None:
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
@ -134,7 +147,8 @@ class Language:
returns a tokenizer.
create_lemmatizer (Callable): Function that takes the nlp object and
returns a lemmatizer.
RETURNS (Language): The newly constructed object.
DOCS: https://spacy.io/api/language#init
"""
# We're only calling this to import all factories provided via entry
# points. The factory decorator applied to these functions takes care
@ -189,6 +203,13 @@ class Language:
@property
def meta(self) -> Dict[str, Any]:
"""Custom meta data of the language class. If a model is loaded, this
includes details from the model's meta.json.
RETURNS (Dict[str, Any]): The meta.
DOCS: https://spacy.io/api/language#meta
"""
spacy_version = util.get_model_version_range(about.__version__)
if self.vocab.lang:
self._meta.setdefault("lang", self.vocab.lang)
@ -221,6 +242,13 @@ class Language:
@property
def config(self) -> Config:
"""Trainable config for the current language instance. Includes the
current pipeline components, as well as default training config.
RETURNS (thinc.api.Config): The config.
DOCS: https://spacy.io/api/language#config
"""
self._config.setdefault("nlp", {})
self._config.setdefault("training", {})
self._config["nlp"]["lang"] = self.lang
@ -382,6 +410,8 @@ class Language:
select the best model. Weights should sum to 1.0 per component and
will be combined and normalized for the whole pipeline.
func (Optional[Callable]): Factory function if not used as a decorator.
DOCS: https://spacy.io/api/language#factory
"""
if not isinstance(name, str):
raise ValueError(Errors.E963.format(decorator="factory"))
@ -460,6 +490,8 @@ class Language:
select the best model. Weights should sum to 1.0 per component and
will be combined and normalized for the whole pipeline.
func (Optional[Callable]): Factory function if not used as a decorator.
DOCS: https://spacy.io/api/language#component
"""
if name is not None and not isinstance(name, str):
raise ValueError(Errors.E963.format(decorator="component"))
@ -504,6 +536,7 @@ class Language:
self,
factory_name: str,
name: Optional[str] = None,
*,
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
overrides: Optional[Dict[str, Any]] = SimpleFrozenDict(),
validate: bool = True,
@ -521,6 +554,8 @@ class Language:
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Callable[[Doc], Doc]): The pipeline component.
DOCS: https://spacy.io/api/language#create_pipe
"""
name = name if name is not None else factory_name
if not isinstance(config, dict):
@ -692,6 +727,7 @@ class Language:
self,
name: str,
factory_name: str,
*,
config: Dict[str, Any] = SimpleFrozenDict(),
validate: bool = True,
) -> None:
@ -761,6 +797,7 @@ class Language:
def __call__(
self,
text: str,
*,
disable: Iterable[str] = tuple(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Doc:
@ -770,8 +807,8 @@ class Language:
text (str): The text to be processed.
disable (list): Names of the pipeline components to disable.
component_cfg (dict): An optional dictionary with extra keyword arguments
for specific components.
component_cfg (Dict[str, dict]): An optional dictionary with extra
keyword arguments for specific components.
RETURNS (Doc): A container for accessing the annotations.
DOCS: https://spacy.io/api/language#call
@ -811,6 +848,7 @@ class Language:
def select_pipes(
self,
*,
disable: Optional[Union[str, Iterable[str]]] = None,
enable: Optional[Union[str, Iterable[str]]] = None,
) -> "DisabledPipes":
@ -853,7 +891,7 @@ class Language:
def update(
self,
examples: Iterable[Example],
dummy: Optional[Any] = None,
_: Optional[Any] = None,
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
@ -863,7 +901,7 @@ class Language:
"""Update the models in the pipeline.
examples (Iterable[Example]): A batch of examples
dummy: Should not be set - serves to catch backwards-incompatible scripts.
_: Should not be set - serves to catch backwards-incompatible scripts.
drop (float): The dropout rate.
sgd (Optimizer): An optimizer.
losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
@ -873,7 +911,7 @@ class Language:
DOCS: https://spacy.io/api/language#update
"""
if dummy is not None:
if _ is not None:
raise ValueError(Errors.E989)
if losses is None:
losses = {}
@ -890,12 +928,10 @@ class Language:
raise TypeError(
Errors.E978.format(name="language", method="update", types=wrong_types)
)
if sgd is None:
if self._optimizer is None:
self._optimizer = create_default_optimizer()
sgd = self._optimizer
if component_cfg is None:
component_cfg = {}
for i, (name, proc) in enumerate(self.pipeline):
@ -915,6 +951,7 @@ class Language:
def rehearse(
self,
examples: Iterable[Example],
*,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
@ -937,8 +974,9 @@ class Language:
>>> nlp.update(labelled_batch)
>>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
>>> nlp.rehearse(raw_batch)
DOCS: https://spacy.io/api/language#rehearse
"""
# TODO: document
if len(examples) == 0:
return
if not isinstance(examples, IterableInstance):
@ -983,17 +1021,18 @@ class Language:
def begin_training(
self,
get_examples: Optional[Callable] = None,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
device: int = -1,
) -> Optimizer:
"""Allocate models, pre-process training data and acquire a trainer and
optimizer. Used as a contextmanager.
"""Initialize the pipe for training, using data examples if available.
get_examples (function): Function returning example training data.
TODO: document format change since 3.0.
sgd (Optional[Optimizer]): An optimizer.
RETURNS: An optimizer.
get_examples (Callable[[], Iterable[Example]]): Optional function that
returns gold-standard Example objects.
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://spacy.io/api/language#begin_training
"""
@ -1022,18 +1061,20 @@ class Language:
return self._optimizer
def resume_training(
self, sgd: Optional[Optimizer] = None, device: int = -1
self, *, sgd: Optional[Optimizer] = None, device: int = -1
) -> Optimizer:
"""Continue training a pretrained model.
Create and return an optimizer, and initialize "rehearsal" for any pipeline
component that has a .rehearse() method. Rehearsal is used to prevent
models from "forgetting" their initialised "knowledge". To perform
models from "forgetting" their initialized "knowledge". To perform
rehearsal, collect samples of text you want the models to retain performance
on, and call nlp.rehearse() with a batch of Example objects.
sgd (Optional[Optimizer]): An optimizer.
RETURNS (Optimizer): The optimizer.
DOCS: https://spacy.io/api/language#resume_training
"""
if device >= 0: # TODO: do we need this here?
require_gpu(device)
@ -1052,11 +1093,12 @@ class Language:
def evaluate(
self,
examples: Iterable[Example],
*,
verbose: bool = False,
batch_size: int = 256,
scorer: Optional[Scorer] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Scorer:
) -> Dict[str, Union[float, dict]]:
"""Evaluate a model's pipeline components.
examples (Iterable[Example]): `Example` objects.
@ -1112,7 +1154,9 @@ class Language:
EXAMPLE:
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk('/tmp/checkpoint')
>>> nlp.to_disk("/tmp/checkpoint")
DOCS: https://spacy.io/api/language#use_params
"""
contexts = [
pipe.use_params(params)
@ -1136,6 +1180,7 @@ class Language:
def pipe(
self,
texts: Iterable[str],
*,
as_tuples: bool = False,
batch_size: int = 1000,
disable: Iterable[str] = tuple(),
@ -1305,6 +1350,16 @@ class Language:
"""Create the nlp object from a loaded config. Will set up the tokenizer
and language data, add pipeline components etc. If no config is provided,
the default config of the given language is used.
config (Dict[str, Any] / Config): The loaded config.
disable (Iterable[str]): List of pipeline component names to disable.
auto_fill (bool): Automatically fill in missing values in config based
on defaults and function argument annotations.
validate (bool): Validate the component config and arguments against
the types expected by the factory.
RETURNS (Language): The initialized Language class.
DOCS: https://spacy.io/api/language#from_config
"""
if auto_fill:
config = util.deep_merge_configs(config, cls.default_config)
@ -1418,7 +1473,6 @@ class Language:
_fix_pretrained_vectors_name(self)
path = util.ensure_path(path)
deserializers = {}
if Path(path / "config.cfg").exists():
deserializers["config.cfg"] = lambda p: self.config.from_disk(p)
@ -1509,6 +1563,11 @@ class Language:
@dataclass
class FactoryMeta:
"""Dataclass containing information about a component and its defaults
provided by the @Language.component or @Language.factory decorator. It's
created whenever a component is defined and stored on the Language class for
each component instance and factory instance.
"""
factory: str
default_config: Optional[Dict[str, Any]] = None # noqa: E704
assigns: Iterable[str] = tuple()
@ -1551,7 +1610,7 @@ def _fix_pretrained_vectors_name(nlp: Language) -> None:
class DisabledPipes(list):
"""Manager for temporary pipeline disabling."""
def __init__(self, nlp: Language, names: List[str]):
def __init__(self, nlp: Language, names: List[str]) -> None:
self.nlp = nlp
self.names = names
# Important! Not deep copy -- we just want the container (but we also