From b0f57a0cac93d2fe3862cb7e33bdf75fbed1d121 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 15:14:07 +0200 Subject: [PATCH] Update docs and consistency --- spacy/language.py | 13 +- spacy/pipeline/entity_linker.py | 6 +- spacy/pipeline/entityruler.py | 10 +- spacy/pipeline/morphologizer.pyx | 8 +- spacy/pipeline/pipe.pyx | 8 +- spacy/pipeline/sentencizer.pyx | 8 +- spacy/pipeline/senter.pyx | 8 +- spacy/pipeline/tagger.pyx | 8 +- spacy/tokenizer.pyx | 8 +- spacy/tokens/doc.pyx | 28 +-- spacy/vocab.pyx | 8 +- website/docs/api/dependencyparser.md | 40 +-- website/docs/api/doc.md | 40 +-- website/docs/api/entitylinker.md | 20 +- website/docs/api/entityrecognizer.md | 40 +-- website/docs/api/language.md | 40 +-- website/docs/api/morphologizer.md | 40 +-- website/docs/api/pipe.md | 40 +-- website/docs/api/sentencerecognizer.md | 40 +-- website/docs/api/tagger.md | 40 +-- website/docs/api/textcategorizer.md | 40 +-- website/docs/api/tok2vec.md | 40 +-- website/docs/api/tokenizer.md | 40 +-- website/docs/api/transformer.md | 331 +++++++++++++++++++++++-- website/docs/api/vocab.md | 40 +-- 25 files changed, 646 insertions(+), 298 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 9dd8a347e..ef185a7eb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -49,6 +49,7 @@ class BaseDefaults: overwritten by language subclasses by defining their own subclasses of Language.Defaults. """ + config: Config = Config() tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES @@ -67,6 +68,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: """Registered function to create a tokenizer. Returns a factory that takes the nlp object and returns a Tokenizer instance using the language detaults. 
""" + def tokenizer_factory(nlp: "Language") -> Tokenizer: prefixes = nlp.Defaults.prefixes suffixes = nlp.Defaults.suffixes @@ -1432,7 +1434,9 @@ class Language: nlp.resolved = resolved return nlp - def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = tuple()) -> None: + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + ) -> None: """Save the current state to a directory. If a model is loaded, this will include the model. @@ -1461,7 +1465,7 @@ class Language: util.to_disk(path, serializers, exclude) def from_disk( - self, path: Union[str, Path], exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() ) -> "Language": """Loads state from a directory. Modifies the object in place and returns it. If the saved `Language` object contains a model, the @@ -1512,7 +1516,7 @@ class Language: self._link_components() return self - def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: + def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes: """Serialize the current state to a binary string. exclude (list): Names of components or serialization fields to exclude. @@ -1534,7 +1538,7 @@ class Language: return util.to_bytes(serializers, exclude) def from_bytes( - self, bytes_data: bytes, exclude: Iterable[str] = tuple() + self, bytes_data: bytes, *, exclude: Iterable[str] = tuple() ) -> "Language": """Load state from a binary string. @@ -1583,6 +1587,7 @@ class FactoryMeta: created whenever a component is defined and stored on the Language class for each component instance and factory instance. 
""" + factory: str default_config: Optional[Dict[str, Any]] = None # noqa: E704 assigns: Iterable[str] = tuple() diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 45713108a..cc4e7b159 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -400,7 +400,9 @@ class EntityLinker(Pipe): for token in ent: token.ent_kb_id_ = kb_id - def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = tuple()) -> None: + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + ) -> None: """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -417,7 +419,7 @@ class EntityLinker(Pipe): util.to_disk(path, serialize, exclude) def from_disk( - self, path: Union[str, Path], exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() ) -> "EntityLinker": """Load the pipe from disk. Modifies the object in place and returns it. diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index d6ce86e78..8f280547e 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -315,7 +315,7 @@ class EntityRuler: return Scorer.score_spans(examples, "ents", **kwargs) def from_bytes( - self, patterns_bytes: bytes, exclude: Iterable[str] = tuple() + self, patterns_bytes: bytes, *, exclude: Iterable[str] = tuple() ) -> "EntityRuler": """Load the entity ruler from a bytestring. @@ -339,7 +339,7 @@ class EntityRuler: self.add_patterns(cfg) return self - def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: + def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes: """Serialize the entity ruler patterns to a bytestring. RETURNS (bytes): The serialized patterns. 
@@ -355,7 +355,7 @@ class EntityRuler: return srsly.msgpack_dumps(serial) def from_disk( - self, path: Union[str, Path], exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() ) -> "EntityRuler": """Load the entity ruler from a file. Expects a file containing newline-delimited JSON (JSONL) with one entry per line. @@ -391,7 +391,9 @@ class EntityRuler: from_disk(path, deserializers_patterns, {}) return self - def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = tuple()) -> None: + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + ) -> None: """Save the entity ruler patterns to a directory. The patterns will be saved as newline-delimited JSON (JSONL). diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 2f2601f3f..4cf1580d3 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -230,7 +230,7 @@ class Morphologizer(Tagger): "morph", **kwargs)) return results - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -244,7 +244,7 @@ class Morphologizer(Tagger): serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. bytes_data (bytes): The serialized pipe. @@ -267,7 +267,7 @@ class Morphologizer(Tagger): util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. path (str / Path): Path to a directory. 
@@ -282,7 +282,7 @@ class Morphologizer(Tagger): } util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. Modifies the object in place and returns it. path (str / Path): Path to a directory. diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index ab80aa32e..c378b42f7 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -180,7 +180,7 @@ class Pipe: """ return {} - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -195,7 +195,7 @@ class Pipe: serialize["vocab"] = self.vocab.to_bytes return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -218,7 +218,7 @@ class Pipe: util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -232,7 +232,7 @@ class Pipe: serialize["model"] = lambda p: self.model.to_disk(p) util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. path (str / Path): Path to a directory. diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 8203249d7..31208ea2c 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -162,7 +162,7 @@ class Sentencizer(Pipe): del results["sents_per_type"] return results - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the sentencizer to a bytestring. 
RETURNS (bytes): The serialized object. @@ -171,7 +171,7 @@ class Sentencizer(Pipe): """ return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the sentencizer from a bytestring. bytes_data (bytes): The data to load. @@ -183,7 +183,7 @@ class Sentencizer(Pipe): self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the sentencizer to disk. DOCS: https://spacy.io/api/sentencizer#to_disk @@ -193,7 +193,7 @@ class Sentencizer(Pipe): srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the sentencizer from disk. DOCS: https://spacy.io/api/sentencizer#from_disk diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 58f6f04b6..e09805e33 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -157,7 +157,7 @@ class SentenceRecognizer(Tagger): del results["sents_per_type"] return results - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -171,7 +171,7 @@ class SentenceRecognizer(Tagger): serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. bytes_data (bytes): The serialized pipe. @@ -194,7 +194,7 @@ class SentenceRecognizer(Tagger): util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. 
path (str / Path): Path to a directory. @@ -209,7 +209,7 @@ class SentenceRecognizer(Tagger): } util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. Modifies the object in place and returns it. path (str / Path): Path to a directory. diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index c96f9c029..28c46d1cd 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -370,7 +370,7 @@ class Tagger(Pipe): scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) return scores - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -388,7 +388,7 @@ class Tagger(Pipe): serialize["morph_rules"] = lambda: srsly.msgpack_dumps(morph_rules) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. bytes_data (bytes): The serialized pipe. @@ -424,7 +424,7 @@ class Tagger(Pipe): util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -443,7 +443,7 @@ class Tagger(Pipe): } util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. Modifies the object in place and returns it. path (str / Path): Path to a directory. 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 858a93ce5..bffbf5829 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -728,7 +728,7 @@ cdef class Tokenizer: with path.open("wb") as file_: file_.write(self.to_bytes(**kwargs)) - def from_disk(self, path, **kwargs): + def from_disk(self, path, *, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. @@ -741,10 +741,10 @@ cdef class Tokenizer: path = util.ensure_path(path) with path.open("rb") as file_: bytes_data = file_.read() - self.from_bytes(bytes_data, **kwargs) + self.from_bytes(bytes_data, exclude=exclude) return self - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the current state to a binary string. exclude (list): String names of serialization fields to exclude. @@ -763,7 +763,7 @@ cdef class Tokenizer: } return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load state from a binary string. bytes_data (bytes): The data to load from. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0ba5abb52..2fcc0983b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -987,20 +987,20 @@ cdef class Doc: other.c = &tokens[PADDING] return other - def to_disk(self, path, **kwargs): + def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. path (str / Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. - exclude (list): String names of serialization fields to exclude. + exclude (Iterable[str]): String names of serialization fields to exclude. 
DOCS: https://spacy.io/api/doc#to_disk """ path = util.ensure_path(path) with path.open("wb") as file_: - file_.write(self.to_bytes(**kwargs)) + file_.write(self.to_bytes(exclude=exclude)) - def from_disk(self, path, **kwargs): + def from_disk(self, path, *, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. @@ -1014,9 +1014,9 @@ cdef class Doc: path = util.ensure_path(path) with path.open("rb") as file_: bytes_data = file_.read() - return self.from_bytes(bytes_data, **kwargs) + return self.from_bytes(bytes_data, exclude=exclude) - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, *, exclude=tuple()): """Serialize, i.e. export the document contents to a binary string. exclude (list): String names of serialization fields to exclude. @@ -1025,9 +1025,9 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_bytes """ - return srsly.msgpack_dumps(self.to_dict(exclude=exclude, **kwargs)) + return srsly.msgpack_dumps(self.to_dict(exclude=exclude)) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Deserialize, i.e. import the document contents from a binary string. data (bytes): The string to load from. @@ -1036,13 +1036,9 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#from_bytes """ - return self.from_dict( - srsly.msgpack_loads(bytes_data), - exclude=exclude, - **kwargs - ) + return self.from_dict(srsly.msgpack_loads(bytes_data), exclude=exclude) - def to_dict(self, exclude=tuple(), **kwargs): + def to_dict(self, *, exclude=tuple()): """Export the document contents to a dictionary for serialization. exclude (list): String names of serialization fields to exclude. 
@@ -1090,14 +1086,14 @@ cdef class Doc: serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) return util.to_dict(serializers, exclude) - def from_dict(self, msg, exclude=tuple(), **kwargs): + def from_dict(self, msg, *, exclude=tuple()): """Deserialize, i.e. import the document contents from a binary string. data (bytes): The string to load from. exclude (list): String names of serialization fields to exclude. RETURNS (Doc): Itself. - DOCS: https://spacy.io/api/doc#from_bytes + DOCS: https://spacy.io/api/doc#from_dict """ if self.length != 0: raise ValueError(Errors.E033.format(length=self.length)) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f41ad2356..7713ec528 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -439,7 +439,7 @@ cdef class Vocab: orth = self.strings.add(orth) return orth in self.vectors - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if @@ -459,7 +459,7 @@ cdef class Vocab: if "lookups" not in "exclude" and self.lookups is not None: self.lookups.to_disk(path) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. @@ -488,7 +488,7 @@ cdef class Vocab: self._by_orth = PreshMap() return self - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the current state to a binary string. exclude (list): String names of serialization fields to exclude. @@ -509,7 +509,7 @@ cdef class Vocab: } return util.to_bytes(getters, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load state from a binary string. bytes_data (bytes): The data to load from. 
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index f6ed7492d..a18e9e582 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -290,10 +290,11 @@ Serialize the pipe to disk. > parser.to_disk("/path/to/parser") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## DependencyParser.from_disk {#from_disk tag="method"} @@ -306,11 +307,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > parser.from_disk("/path/to/parser") > ``` -| Name | Type | Description | -| ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. 
| +| Name | Type | Description | +| -------------- | ------------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | ## DependencyParser.to_bytes {#to_bytes tag="method"} @@ -323,10 +325,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. | ## DependencyParser.from_bytes {#from_bytes tag="method"} @@ -340,11 +343,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > parser.from_bytes(parser_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------------ | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `DependencyParser` | The `DependencyParser` object. 
| +| Name | Type | Description | +| -------------- | ------------------ | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `DependencyParser` | The `DependencyParser` object. | ## DependencyParser.labels {#labels tag="property"} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 69608c958..a9499f6d4 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -385,10 +385,11 @@ Save the current state to a directory. > doc.to_disk("/path/to/doc") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Doc.from_disk {#from_disk tag="method" new="2"} @@ -402,11 +403,12 @@ Loads state from a directory. Modifies the object in place and returns it. > doc = Doc(Vocab()).from_disk("/path/to/doc") > ``` -| Name | Type | Description | -| ----------- | ------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. 
Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Doc` | The modified `Doc` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Doc` | The modified `Doc` object. | ## Doc.to_bytes {#to_bytes tag="method"} @@ -419,10 +421,11 @@ Serialize, i.e. export the document contents to a binary string. > doc_bytes = doc.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. | ## Doc.from_bytes {#from_bytes tag="method"} @@ -438,11 +441,12 @@ Deserialize, i.e. import the document contents from a binary string. > assert doc.text == doc2.text > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `data` | bytes | The string to load from. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. 
| -| **RETURNS** | `Doc` | The `Doc` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `data` | bytes | The string to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Doc` | The `Doc` object. | ## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index c29f0326c..2a1ba94d2 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -265,10 +265,11 @@ Serialize the pipe to disk. > entity_linker.to_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityLinker.from_disk {#from_disk tag="method"} @@ -281,11 +282,12 @@ Load the pipe from disk. Modifies the object in place and returns it. 
> entity_linker.from_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index b1d40a9c3..b5b549a04 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -289,10 +289,11 @@ Serialize the pipe to disk. > ner.to_disk("/path/to/ner") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityRecognizer.from_disk {#from_disk tag="method"} @@ -305,11 +306,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > ner.from_disk("/path/to/ner") > ``` -| Name | Type | Description | -| ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | +| Name | Type | Description | +| -------------- | ------------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | ## EntityRecognizer.to_bytes {#to_bytes tag="method"} @@ -322,10 +324,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| -| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. | ## EntityRecognizer.from_bytes {#from_bytes tag="method"} @@ -339,11 +342,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > ner.from_bytes(ner_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------------ | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. | +| Name | Type | Description | +| -------------- | ------------------ | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. | ## EntityRecognizer.labels {#labels tag="property"} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 0f7797d7f..7e25106d1 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -645,10 +645,11 @@ the model**. > nlp.to_disk("/path/to/models") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. 
Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | ## Language.from_disk {#from_disk tag="method" new="2"} @@ -670,11 +671,12 @@ loaded object. > nlp = English().from_disk("/path/to/en_model") > ``` -| Name | Type | Description | -| ----------- | ------------ | ----------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Language` | The modified `Language` object. | +| Name | Type | Description | +| -------------- | --------------- | ----------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Language` | The modified `Language` object. | ## Language.to_bytes {#to_bytes tag="method"} @@ -686,10 +688,11 @@ Serialize the current state to a binary string. 
> nlp_bytes = nlp.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ----------------------------------------------------------------------------------------- | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Language` object. | +| Name | Type | Description | +| -------------- | --------------- | ----------------------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Language` object. | ## Language.from_bytes {#from_bytes tag="method"} @@ -707,11 +710,12 @@ available to the loaded object. > nlp2.from_bytes(nlp_bytes) > ``` -| Name | Type | Description | -| ------------ | ---------- | ----------------------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Language` | The `Language` object. | +| Name | Type | Description | +| -------------- | --------------- | ----------------------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Language` | The `Language` object. | ## Attributes {#attributes} diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index a153bd51c..ac7146543 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -276,10 +276,11 @@ Serialize the pipe to disk. 
> morphologizer.to_disk("/path/to/morphologizer") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Morphologizer.from_disk {#from_disk tag="method"} @@ -292,11 +293,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > morphologizer.from_disk("/path/to/morphologizer") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Morphologizer` | The modified `Morphologizer` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| +| **RETURNS** | `Morphologizer` | The modified `Morphologizer` object. | ## Morphologizer.to_bytes {#to_bytes tag="method"} @@ -309,10 +311,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Morphologizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Morphologizer` object. | ## Morphologizer.from_bytes {#from_bytes tag="method"} @@ -326,11 +329,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > morphologizer.from_bytes(morphologizer_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Morphologizer` | The `Morphologizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Morphologizer` | The `Morphologizer` object. 
| ## Morphologizer.labels {#labels tag="property"} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index a2d055d88..99d06c79f 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -306,10 +306,11 @@ Serialize the pipe to disk. > pipe.to_disk("/path/to/pipe") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Pipe.from_disk {#from_disk tag="method"} @@ -322,11 +323,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > pipe.from_disk("/path/to/pipe") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Pipe` | The modified pipe. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. 
Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Pipe` | The modified pipe. | ## Pipe.to_bytes {#to_bytes tag="method"} @@ -339,10 +341,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the pipe. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the pipe. | ## Pipe.from_bytes {#from_bytes tag="method"} @@ -356,11 +359,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > pipe.from_bytes(pipe_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Pipe` | The pipe. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Pipe` | The pipe. 
| ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index f7d2ac00f..fdc950bb0 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -291,10 +291,11 @@ Serialize the pipe to disk. > senter.to_disk("/path/to/senter") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## SentenceRecognizer.from_disk {#from_disk tag="method"} @@ -307,11 +308,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > senter.from_disk("/path/to/senter") > ``` -| Name | Type | Description | -| ----------- | -------------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `SentenceRecognizer` | The modified `SentenceRecognizer` object. 
| +| Name | Type | Description | +| -------------- | -------------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `SentenceRecognizer` | The modified `SentenceRecognizer` object. | ## SentenceRecognizer.to_bytes {#to_bytes tag="method"} @@ -324,10 +326,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `SentenceRecognizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `SentenceRecognizer` object. | ## SentenceRecognizer.from_bytes {#from_bytes tag="method"} @@ -341,11 +344,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > senter.from_bytes(senter_bytes) > ``` -| Name | Type | Description | -| ------------ | -------------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `SentenceRecognizer` | The `SentenceRecognizer` object. 
| +| Name | Type | Description | +| -------------- | -------------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `SentenceRecognizer` | The `SentenceRecognizer` object. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index cc7401016..37ef13453 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -307,10 +307,11 @@ Serialize the pipe to disk. > tagger.to_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Tagger.from_disk {#from_disk tag="method"} @@ -323,11 +324,12 @@ Load the pipe from disk. Modifies the object in place and returns it. 
> tagger.from_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tagger` | The modified `Tagger` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tagger` | The modified `Tagger` object. | ## Tagger.to_bytes {#to_bytes tag="method"} @@ -340,10 +342,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tagger` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Tagger` object. | ## Tagger.from_bytes {#from_bytes tag="method"} @@ -357,11 +360,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. 
> tagger.from_bytes(tagger_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tagger` | The `Tagger` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tagger` | The `Tagger` object. | ## Tagger.labels {#labels tag="property"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index c0dd07c1e..1efd5831c 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -325,10 +325,11 @@ Serialize the pipe to disk. > textcat.to_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. 
| +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## TextCategorizer.from_disk {#from_disk tag="method"} @@ -341,11 +342,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > textcat.from_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| ----------- | ----------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | +| Name | Type | Description | +| -------------- | ----------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | ## TextCategorizer.to_bytes {#to_bytes tag="method"} @@ -358,10 +360,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. 
| +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. | ## TextCategorizer.from_bytes {#from_bytes tag="method"} @@ -375,11 +378,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > textcat.from_bytes(textcat_bytes) > ``` -| Name | Type | Description | -| ------------ | ----------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. | +| Name | Type | Description | +| -------------- | ----------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. | ## TextCategorizer.labels {#labels tag="property"} diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 11167c428..f810793ce 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -227,10 +227,11 @@ Serialize the pipe to disk. > tok2vec.to_disk("/path/to/tok2vec") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. 
| -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Tok2Vec.from_disk {#from_disk tag="method"} @@ -243,11 +244,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > tok2vec.from_disk("/path/to/tok2vec") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. | ## Tok2Vec.to_bytes {#to_bytes tag="method"} @@ -260,10 +262,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. 
-| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. | ## Tok2Vec.from_bytes {#from_bytes tag="method"} @@ -277,11 +280,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > tok2vec.from_bytes(tok2vec_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 02023cf9f..23b6e4f3f 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -158,10 +158,11 @@ Serialize the tokenizer to disk. 
> tokenizer.to_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Tokenizer.from_disk {#from_disk tag="method"} @@ -174,11 +175,12 @@ Load the tokenizer from disk. Modifies the object in place and returns it. > tokenizer.from_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| ----------- | ------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. 
| ## Tokenizer.to_bytes {#to_bytes tag="method"} @@ -191,10 +193,11 @@ Load the tokenizer from disk. Modifies the object in place and returns it. Serialize the tokenizer to a bytestring. -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | ## Tokenizer.from_bytes {#from_bytes tag="method"} @@ -209,11 +212,12 @@ it. > tokenizer.from_bytes(tokenizer_bytes) > ``` -| Name | Type | Description | -| ------------ | ----------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tokenizer` | The `Tokenizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tokenizer` | The `Tokenizer` object. | ## Attributes {#attributes} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index aab02fe68..e89ecb6b7 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -51,11 +51,11 @@ architectures and their arguments and hyperparameters. 
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > ``` -| Setting | Type | Description | Default | -| ------------------- | ------------------------------------------ | ------------------------------- | ------------------------------------------------------------------- | -| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | -| `annotation_setter` | Callable | | [`null_annotation_setter`](/api/transformer#null_annotation_setter) | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) | +| Setting | Type | Description | Default | +| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | +| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | +| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](#fulltransformerbatch) and can set additional annotations on the `Doc`. | `null_annotation_setter` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. 
| [TransformerModel](/api/architectures#TransformerModel) | ```python https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py @@ -69,8 +69,14 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p > # Construction via add_pipe with default model > trf = nlp.add_pipe("transformer") > -> # Construction via add_pipe with custom model -> config = {"model": {"@architectures": "my_transformer"}} +> # Construction via add_pipe with custom config +> config = { +> "model": { +> "@architectures": "spacy-transformers.TransformerModel.v1", +> "name": "bert-base-uncased", +> "tokenizer_config": {"use_fast": True} +> } +> } > trf = nlp.add_pipe("transformer", config=config) > > # Construction from class @@ -82,26 +88,313 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Type | Description | -| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `annotation_setter` | `Callable` | | -| _keyword-only_ | | | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. 
| +| Name | Type | Description | +| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. Defaults to `null_annotation_setter`, a function that does nothing. | +| _keyword-only_ | | | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. | - +## Transformer.\_\_call\_\_ {#call tag="method"} + +Apply the pipe to one document. The document is modified in place, and returned. +This usually happens under the hood when the `nlp` object is called on a text +and all pipeline components are applied to the `Doc` in order. Both +[`__call__`](/api/transformer#call) and [`pipe`](/api/transformer#pipe) delegate +to the [`predict`](/api/transformer#predict) and +[`set_annotations`](/api/transformer#set_annotations) methods. + +> #### Example +> +> ```python +> doc = nlp("This is a sentence.") +> trf = nlp.add_pipe("transformer") +> # This usually happens under the hood +> processed = trf(doc) +> ``` + +| Name | Type | Description | +| ----------- | ----- | ------------------------ | +| `doc` | `Doc` | The document to process. | +| **RETURNS** | `Doc` | The processed document. | + +## Transformer.pipe {#pipe tag="method"} + +Apply the pipe to a stream of documents. 
This usually happens under the hood +when the `nlp` object is called on a text and all pipeline components are +applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and +[`pipe`](/api/transformer#pipe) delegate to the +[`predict`](/api/transformer#predict) and +[`set_annotations`](/api/transformer#set_annotations) methods. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> for doc in trf.pipe(docs, batch_size=50): +> pass +> ``` + +| Name | Type | Description | +| -------------- | --------------- | ----------------------------------------------------- | +| `stream` | `Iterable[Doc]` | A stream of documents. | +| _keyword-only_ | | | +| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | +| **YIELDS** | `Doc` | The processed documents in order. | + +## Transformer.begin_training {#begin_training tag="method"} + +Initialize the pipe for training, using data examples if available. Returns an +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> optimizer = trf.begin_training(pipeline=nlp.pipeline) +> ``` + +| Name | Type | Description | +| -------------- | --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- | +| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | +| _keyword-only_ | | | +| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/transformer#create_optimizer) if not set. | +| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. 
| + +## Transformer.predict {#predict tag="method"} + +Apply the pipeline's model to a batch of docs, without modifying them. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> scores = trf.predict([doc1, doc2]) +> ``` + +| Name | Type | Description | +| ----------- | --------------- | ----------------------------------------- | +| `docs` | `Iterable[Doc]` | The documents to predict. | +| **RETURNS** | - | The model's prediction for each document. | + +## Transformer.set_annotations {#set_annotations tag="method"} + +Modify a batch of documents, using pre-computed scores. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> scores = trf.predict(docs) +> trf.set_annotations(docs, scores) +> ``` + +| Name | Type | Description | +| -------- | --------------- | ----------------------------------------------------- | +| `docs` | `Iterable[Doc]` | The documents to modify. | +| `scores` | - | The scores to set, produced by `Transformer.predict`. | + +## Transformer.update {#update tag="method"} + +Learn from a batch of documents and gold-standard information, updating the +pipe's model. Delegates to [`predict`](/api/transformer#predict). + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> optimizer = nlp.begin_training() +> losses = trf.update(examples, sgd=optimizer) +> ``` + +| Name | Type | Description | +| ----------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). 
| +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | + +## Transformer.create_optimizer {#create_optimizer tag="method"} + +Create an optimizer for the pipeline component. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> optimizer = trf.create_optimizer() +> ``` + +| Name | Type | Description | +| ----------- | --------------------------------------------------- | -------------- | +| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | + +## Transformer.use_params {#use_params tag="method, contextmanager"} + +Modify the pipe's model, to use the given parameter values. At the end of the +context, the original parameters are restored. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> with trf.use_params(optimizer.averages): +> trf.to_disk("/best_model") +> ``` + +| Name | Type | Description | +| -------- | ---- | ----------------------------------------- | +| `params` | dict | The parameter values to use in the model. | + +## Transformer.to_disk {#to_disk tag="method"} + +Serialize the pipe to disk. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> trf.to_disk("/path/to/transformer") +> ``` + +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | + +## Transformer.from_disk {#from_disk tag="method"} + +Load the pipe from disk. 
Modifies the object in place and returns it. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> trf.from_disk("/path/to/transformer") +> ``` + +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Transformer` | The modified `Transformer` object. | + +## Transformer.to_bytes {#to_bytes tag="method"} + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> trf_bytes = trf.to_bytes() +> ``` + +Serialize the pipe to a bytestring. + +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Transformer` object. | + +## Transformer.from_bytes {#from_bytes tag="method"} + +Load the pipe from a bytestring. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> trf_bytes = trf.to_bytes() +> trf = nlp.add_pipe("transformer") +> trf.from_bytes(trf_bytes) +> ``` + +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Transformer` | The `Transformer` object. | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. 
If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> trf.to_disk("/path", exclude=["vocab"]) +> ``` + +| Name | Description | +| ------- | -------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `cfg` | The config file. You usually don't want to exclude this. | +| `model` | The binary model data. You usually don't want to exclude this. | ## TransformerData {#transformerdata tag="dataclass"} +Transformer tokens and outputs for one `Doc` object. + +| Name | Type | Description | +| --------- | -------------------------------------------------- | ----------------------------------------- | +| `tokens` | `Dict` | | +| `tensors` | `List[FloatsXd]` | | +| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | +| `width` | int | | + +### TransformerData.empty {#transformerdata-empty tag="classmethod"} + + + +| Name | Type | Description | +| ----------- | ----------------- | -------------- | +| **RETURNS** | `TransformerData` | | + ## FullTransformerBatch {#fulltransformerbatch tag="dataclass"} + + + +| Name | Type | Description | +| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | +| `spans` | `List[List[Span]]` | | +| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=batchencoding#transformers.BatchEncoding) | | +| `tensors` | `List[torch.Tensor]` | | +| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | +| `doc_data` | `List[TransformerData]` | | + +### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"} + + + +| Name | Type | Description | +| ----------- | ---------------------- | -------------- | +| `arrays` | `List[List[Floats3d]]` 
| | +| **RETURNS** | `FullTransformerBatch` | | + +### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"} + +Split a `TransformerData` object that represents a batch into a list with one +`TransformerData` per `Doc`. + +| Name | Type | Description | +| ----------- | ----------------------- | -------------- | +| **RETURNS** | `List[TransformerData]` | | + ## Custom attributes {#custom-attributes} The component sets the following [custom extension attributes](/usage/processing-pipeline#custom-components-attributes): -| Name | Type | Description | -| -------------- | ----------------- | -------------- | -| `Doc.trf_data` | `TransformerData` | | +| Name | Type | Description | +| -------------- | ----------------------------------------------------- | ---------------------------------------------------- | +| `Doc.trf_data` | [`TransformerData`](/api/transformer#transformerdata) | Transformer tokens and outputs for the `Doc` object. | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index c68af2047..d5c9b0ff0 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -230,10 +230,11 @@ Save the current state to a directory. > nlp.vocab.to_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. 
Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Vocab.from_disk {#from_disk tag="method" new="2"} @@ -246,11 +247,12 @@ Loads state from a directory. Modifies the object in place and returns it. > vocab = Vocab().from_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| ----------- | ------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The modified `Vocab` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Vocab` | The modified `Vocab` object. | ## Vocab.to_bytes {#to_bytes tag="method"} @@ -262,10 +264,11 @@ Serialize the current state to a binary string. > vocab_bytes = nlp.vocab.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Vocab` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| +| **RETURNS** | bytes | The serialized form of the `Vocab` object. | ## Vocab.from_bytes {#from_bytes tag="method"} @@ -280,11 +283,12 @@ Load state from a binary string. > vocab.from_bytes(vocab_bytes) > ``` -| Name | Type | Description | -| ------------ | ------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The `Vocab` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Vocab` | The `Vocab` object. | ## Attributes {#attributes}