diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 7c6c76a34..cc6cb98ea 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -54,7 +54,7 @@ def debug_model_cli( nlp, config = util.load_model_from_config(cfg) except ValueError as e: msg.fail(str(e), exits=1) - seed = config.get("training", {}).get("seed", None) + seed = config["pretraining"]["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 0ea67747e..abe050661 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -221,17 +221,21 @@ config from being resolved. This means that you may not see all validation errors at once and some issues are only shown once previous errors have been fixed. +Instead of specifying all required settings in the config file, you can rely on +an auto-fill functionality that uses spaCy's built-in defaults. The resulting +full config can be written to file and used in downstream training tasks. + ```bash -$ python -m spacy debug config [config_path] [--code] [overrides] +$ python -m spacy debug config [config_path] [--code_path] [--output] [--auto_fill] [--diff] [overrides] ``` -> #### Example +> #### Example 1 > > ```bash > $ python -m spacy debug config ./config.cfg > ``` - + ``` ✘ Config validation error @@ -250,12 +254,30 @@ training -> width extra fields not permitted -| Argument | Type | Description | -| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | -| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. | +> #### Example 2 +> +> ```bash +> $ python -m spacy debug config ./minimal_config.cfg -F -o ./filled_config.cfg +> ``` + + + +``` +✔ Auto-filled config is valid +✔ Saved updated config to ./filled_config.cfg +``` + + + +| Argument | Type | Default | Description | +| --------------------- | ---------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `config_path` | positional | - | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | +| `--code_path`, `-c` | option | `None` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | +| `--auto_fill`, `-F` | option | `False` | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. | +| `--output_path`, `-o` | option | `None` | Output path where the filled config can be stored. Use '-' for standard output. | +| `--diff`, `-D` | option | `False` | Show a visual diff if config was auto-filled. | +| `--help`, `-h` | flag | `False` | Show help message and available arguments. 
| +| overrides | | `None` | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. | ### debug data {#debug-data} @@ -433,7 +455,135 @@ will not be available. | `--help`, `-h` | flag | Show help message and available arguments. | | overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. | - + + +### debug model {#debug-model} + +Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a +sample text and checking how it updates its internal weights and parameters. + +```bash +$ python -m spacy debug model [config_path] [component] [--layers] [-DIM] [-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [P3] [--gpu_id] +``` + +> #### Example 1 +> +> ```bash +> $ python -m spacy debug model ./config.cfg tagger -P0 +> ``` + + + +``` +ℹ Using CPU +ℹ Fixing random seed: 0 +ℹ Analysing model with ID 62 + +========================== STEP 0 - before training ========================== +ℹ Layer 0: model ID 62: +'extract_features>>list2ragged>>with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed>>with_array-maxout>>layernorm>>dropout>>ragged2list>>with_array-residual>>residual>>residual>>residual>>with_array-softmax' +ℹ Layer 1: model ID 59: +'extract_features>>list2ragged>>with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed>>with_array-maxout>>layernorm>>dropout>>ragged2list>>with_array-residual>>residual>>residual>>residual' +ℹ Layer 2: model ID 61: 'with_array-softmax' +ℹ Layer 3: model ID 24: +'extract_features>>list2ragged>>with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed>>with_array-maxout>>layernorm>>dropout>>ragged2list' +ℹ Layer 4: model ID 58: 'with_array-residual>>residual>>residual>>residual' +ℹ Layer 5: model ID 60: 'softmax' +ℹ Layer 6: model ID 13: 'extract_features' +ℹ Layer 7: model ID 14: 'list2ragged' +ℹ Layer 8: model ID 16: +'with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed' +ℹ Layer 9: model ID 22: 'with_array-maxout>>layernorm>>dropout' +ℹ Layer 10: model ID 23: 'ragged2list' +ℹ Layer 11: model ID 57: 'residual>>residual>>residual>>residual' +ℹ Layer 12: model ID 15: +'ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed' +ℹ Layer 13: model ID 21: 'maxout>>layernorm>>dropout' +ℹ Layer 14: model ID 32: 'residual' +ℹ Layer 15: model ID 40: 'residual' +ℹ Layer 16: model ID 48: 'residual' +ℹ Layer 17: model ID 56: 'residual' +ℹ Layer 18: model ID 3: 'ints-getitem>>hashembed' +ℹ Layer 19: model ID 6: 'ints-getitem>>hashembed' +ℹ Layer 20: model ID 9: 'ints-getitem>>hashembed' +... +``` + + + +In this example log, we just print the name of each layer after creation of the +model ("Step 0"), which helps us to understand the internal structure of the +Neural Network, and to focus on specific layers that we want to inspect further +(see next example). 
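+
+If you want to explore the same layer structure programmatically rather than
+through the CLI, here is a minimal sketch using Thinc's
+[`Model.walk`](https://thinc.ai/docs/api-model) method. It assumes you already
+have an `nlp` pipeline with a constructed "tagger" component; the component
+name and the loading step are placeholders, not produced by the command above.
+
+```python
+# Sketch only: `nlp` is assumed to exist, e.g. loaded from a trained model.
+model = nlp.get_pipe("tagger").model
+for i, node in enumerate(model.walk()):
+    # walk() yields the model itself plus all nested layers, similar to the
+    # numbered layers in the log above.
+    print(f"Layer {i}: model ID {node.id}: '{node.name}'")
+```
+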
+ +> #### Example 2 +> +> ```bash +> $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P2 +> ``` + + + +``` +ℹ Using CPU +ℹ Fixing random seed: 0 +ℹ Analysing model with ID 62 + +========================= STEP 0 - before training ========================= +ℹ Layer 5: model ID 60: 'softmax' +ℹ - dim nO: None +ℹ - dim nI: 96 +ℹ - param W: None +ℹ - param b: None +ℹ Layer 15: model ID 40: 'residual' +ℹ - dim nO: None +ℹ - dim nI: None + +======================= STEP 1 - after initialization ======================= +ℹ Layer 5: model ID 60: 'softmax' +ℹ - dim nO: 4 +ℹ - dim nI: 96 +ℹ - param W: (4, 96) - sample: [0. 0. 0. 0. 0.] +ℹ - param b: (4,) - sample: [0. 0. 0. 0.] +ℹ Layer 15: model ID 40: 'residual' +ℹ - dim nO: 96 +ℹ - dim nI: None + +========================== STEP 2 - after training ========================== +ℹ Layer 5: model ID 60: 'softmax' +ℹ - dim nO: 4 +ℹ - dim nI: 96 +ℹ - param W: (4, 96) - sample: [ 0.00283958 -0.00294119 0.00268396 -0.00296219 +-0.00297141] +ℹ - param b: (4,) - sample: [0.00300002 0.00300002 0.00300002 0.00300002] +ℹ Layer 15: model ID 40: 'residual' +ℹ - dim nO: 96 +ℹ - dim nI: None +``` + + + +In this example log, we see how initialization of the model (Step 1) propagates +the correct values for the `nI` (input) and `nO` (output) dimensions of the +various layers. In the `softmax` layer, this step also defines the `W` matrix as +an all-zero matrix determined by the `nO` and `nI` dimensions. After a first +training step (Step 2), this matrix has clearly updated its values through the +training feedback loop. + +| Argument | Type | Default | Description | +| ----------------------- | ---------- | ------- | ---------------------------------------------------------------------------------------------------- | +| `config_path` | positional | | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | +| `component` | positional | | Name of the pipeline component of which the model should be analysed. | +| `--layers`, `-l` | option | | Comma-separated names of layer IDs to print. | +| `--dimensions`, `-DIM` | option | `False` | Show dimensions of each layer. | +| `--parameters`, `-PAR` | option | `False` | Show parameters of each layer. | +| `--gradients`, `-GRAD` | option | `False` | Show gradients of each layer. | +| `--attributes`, `-ATTR` | option | `False` | Show attributes of each layer. | +| `--print-step0`, `-P0` | option | `False` | Print model before training. | +| `--print-step1`, `-P1` | option | `False` | Print model after initialization. | +| `--print-step2`, `-P2` | option | `False` | Print model after training. | +| `--print-step3`, `-P3` | option | `False` | Print final predictions. | +| `--help`, `-h` | flag | | Show help message and available arguments. | ## Train {#train} diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 10fef6ba6..210e5d47d 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -28,7 +28,7 @@ spaCy's training format. To convert one or more existing `Doc` objects to spaCy's JSON format, you can use the [`gold.docs_to_json`](/api/top-level#docs_to_json) helper. -> #### Annotating entities +> #### Annotating entities {#biluo} > > Named entities are provided in the > [BILUO](/usage/linguistic-features#accessing-ner) notation. 
Tokens outside an
@@ -75,6 +75,123 @@ from the English Wall Street Journal portion of the Penn Treebank:
https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json
```
+### Annotations in dictionary format {#dict-input}
+
+To create [`Example`](/api/example) objects, you can create a dictionary of the
+gold-standard annotations `gold_dict`, and then call
+
+```python
+example = Example.from_dict(doc, gold_dict)
+```
+
+There are currently two formats supported for this dictionary of annotations:
+one with a simple, flat structure of keywords, and one with a more hierarchical
+structure.
+
+#### Flat structure {#dict-flat}
+
+Here is the full overview of potential entries in a flat dictionary of
+annotations. You only need to specify the keys corresponding to the task you
+want to train.
+
+```python
+### Flat dictionary
+{
+    "text": string,  # Raw text.
+    "words": List[string],  # List of gold tokens.
+    "lemmas": List[string],  # List of lemmas.
+    "spaces": List[bool],  # List of boolean values indicating whether the corresponding token is followed by a space or not.
+    "tags": List[string],  # List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging).
+    "pos": List[string],  # List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging).
+    "morphs": List[string],  # List of [morphological features](/usage/linguistic-features#rule-based-morphology).
+    "sent_starts": List[bool],  # List of boolean values indicating whether each token is the first of a sentence or not.
+    "deps": List[string],  # List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head.
+    "heads": List[int],  # List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text.
+    "entities": List[string],  # Option 1: List of [BILUO tags](#biluo) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens.
+    "entities": List[(int, int, string)],  # Option 2: List of `(start, end, label)` tuples defining all entities in the text.
+    "cats": Dict[str, float],  # Dictionary of `label:value` pairs indicating how relevant a certain [category](/api/textcategorizer) is for the text.
+    "links": Dict[(int, int), Dict],  # Dictionary of `offset:dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs.
+}
+```
+
+There are a few caveats to take into account:
+
+- Multiple formats are possible for the "entities" entry, but you have to pick
+  one.
+- Any values for sentence starts will be ignored if there are annotations for
+  dependency relations.
+- If the dictionary contains values for "text" and "words", but not "spaces",
+  the latter are inferred automatically, as shown in the sketch below. If
+  "words" is not provided either, the values are inferred from the `doc`
+  argument.
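+
+As a small illustration of that last caveat (the token texts and tags here are
+made up for the example, and the import paths follow the same `spacy.gold`
+module used elsewhere on this page), the `"spaces"` entry is omitted and gets
+inferred by matching `"words"` against `"text"`:
+
+```python
+from spacy.vocab import Vocab
+from spacy.tokens import Doc
+from spacy.gold import Example
+
+vocab = Vocab()
+doc = Doc(vocab, words=["I", "like", "stuff", "."])
+# No "spaces" key: whether each token is followed by a space is inferred
+# by matching the gold "words" against the raw "text".
+gold_dict = {
+    "text": "I like stuff.",
+    "words": ["I", "like", "stuff", "."],
+    "tags": ["PRP", "VBP", "NN", "."],
+}
+example = Example.from_dict(doc, gold_dict)
+```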
+
+##### Examples
+
+```python
+# Training data for a part-of-speech tagger
+doc = Doc(vocab, words=["I", "like", "stuff"])
+example = Example.from_dict(doc, {"tags": ["NOUN", "VERB", "NOUN"]})
+
+# Training data for an entity recognizer (option 1)
+doc = nlp("Laura flew to Silicon Valley.")
+biluo_tags = ["U-PERSON", "O", "O", "B-LOC", "L-LOC", "O"]
+example = Example.from_dict(doc, {"entities": biluo_tags})
+
+# Training data for an entity recognizer (option 2)
+doc = nlp("Laura flew to Silicon Valley.")
+entity_tuples = [
+    (0, 5, "PERSON"),
+    (14, 28, "LOC"),
+]
+example = Example.from_dict(doc, {"entities": entity_tuples})
+
+# Training data for text categorization
+doc = nlp("I'm pretty happy about that!")
+example = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
+
+# Training data for an Entity Linking component
+doc = nlp("Russ Cochran his reprints include EC Comics.")
+example = Example.from_dict(doc, {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}})
+```
+
+#### Hierarchical structure {#dict-hierarch}
+
+Internally, a more hierarchical dictionary structure is used to store
+gold-standard annotations. Its format is similar to the structure described in
+the previous section, but there are two main sections `token_annotation` and
+`doc_annotation`, and the keys for token annotations should be uppercase
+[`Token` attributes](/api/token#attributes) such as "ORTH" and "TAG".
+
+```python
+### Hierarchical dictionary
+{
+    "text": string,  # Raw text.
+    "token_annotation": {
+        "ORTH": List[string],  # List of gold tokens.
+        "LEMMA": List[string],  # List of lemmas.
+        "SPACY": List[bool],  # List of boolean values indicating whether the corresponding token is followed by a space or not.
+        "TAG": List[string],  # List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging).
+        "POS": List[string],  # List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging).
+        "MORPH": List[string],  # List of [morphological features](/usage/linguistic-features#rule-based-morphology).
+        "SENT_START": List[bool],  # List of boolean values indicating whether each token is the first of a sentence or not.
+        "DEP": List[string],  # List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head.
+        "HEAD": List[int],  # List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text.
+    },
+    "doc_annotation": {
+        "entities": List[string],  # List of [BILUO tags](#biluo) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens.
+        "cats": Dict[str, float],  # Dictionary of `label:value` pairs indicating how relevant a certain [category](/api/textcategorizer) is for the text.
+        "links": Dict[(int, int), Dict],  # Dictionary of `offset:dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs.
+    }
+}
+```
+
+There are a few caveats to take into account:
+
+- Any values for sentence starts will be ignored if there are annotations for
+  dependency relations.
+- If the dictionary contains values for "text" and "ORTH", but not "SPACY", the
+  latter are inferred automatically. If "ORTH" is not provided either, the
+  values are inferred from the `doc` argument.
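+
+As a short sketch of this hierarchical layout (using the same
+`Example.from_dict` entry point as above, with illustrative annotation values),
+the tagger and text categorization examples from the flat format could
+equivalently be written as:
+
+```python
+doc = Doc(vocab, words=["I", "like", "stuff"])
+gold_dict = {
+    "token_annotation": {"TAG": ["NOUN", "VERB", "NOUN"]},
+    "doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}},
+}
+example = Example.from_dict(doc, gold_dict)
+```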
+ ## Training config {#config new="3"} Config files define the training process and model pipeline and can be passed to diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 1e9beaf82..18d9c5edd 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -32,15 +32,13 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` - - -| Setting | Type | Description | Default | -| ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- | -| `kb` | `KnowledgeBase` | | `None` | -| `labels_discard` | `Iterable[str]` | | `[]` | -| `incl_prior` | bool | |  `True` | -| `incl_context` | bool | | `True` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | +| Setting | Type | Description | Default | +| ---------------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- | +| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases. | `None` | +| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` | +| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` | +| `incl_context` | bool | Whether or not to include the local context in the model. | `True` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py @@ -75,10 +73,10 @@ shortcut for this and instantiate the component using its string name and | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | _keyword-only_ | | | -| `kb` | `KnowlegeBase` | | -| `labels_discard` | `Iterable[str]` | | -| `incl_prior` | bool | | -| `incl_context` | bool | | +| `kb` | `KnowlegeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases. | +| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | +| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | +| `incl_context` | bool | Whether or not to include the local context in the model. | ## EntityLinker.\_\_call\_\_ {#call tag="method"} @@ -130,15 +128,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and ## EntityLinker.begin_training {#begin_training tag="method"} Initialize the pipe for training, using data examples if available. Returns an -[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this -method, a knowledge base should have been defined with -[`set_kb`](/api/entitylinker#set_kb). +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example > > ```python > entity_linker = nlp.add_pipe("entity_linker", last=True) -> entity_linker.set_kb(kb) > optimizer = entity_linker.begin_training(pipeline=nlp.pipeline) > ``` @@ -210,22 +205,6 @@ pipe's entity linking model and context encoder. Delegates to | `losses` | `Dict[str, float]` | Optional record of the loss during training. 
Updated using the component name as the key. | | **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | -## EntityLinker.set_kb {#set_kb tag="method"} - -Define the knowledge base (KB) used for disambiguating named entities to KB -identifiers. - -> #### Example -> -> ```python -> entity_linker = nlp.add_pipe("entity_linker") -> entity_linker.set_kb(kb) -> ``` - -| Name | Type | Description | -| ---- | --------------- | ------------------------------- | -| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). | - ## EntityLinker.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 1257fdc1e..8c117aec7 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -8,8 +8,9 @@ new: 3.0 An `Example` holds the information for one training instance. It stores two `Doc` objects: one for holding the gold-standard reference data, and one for -holding the predictions of the pipeline. An `Alignment` object stores the -alignment between these two documents, as they can differ in tokenization. +holding the predictions of the pipeline. An [`Alignment`](#alignment-object) +object stores the alignment between these two documents, as they can differ in +tokenization. ## Example.\_\_init\_\_ {#init tag="method"} @@ -40,9 +41,8 @@ both documents. ## Example.from_dict {#from_dict tag="classmethod"} Construct an `Example` object from the `predicted` document and the reference -annotations provided as a dictionary. - - +annotations provided as a dictionary. For more details on the required format, +see the [training format documentation](/api/data-formats#dict-input). > #### Example > @@ -244,8 +244,9 @@ accuracy of predicted entities against the original gold-standard annotation. ## Example.to_dict {#to_dict tag="method"} -Return a dictionary representation of the reference annotation contained in this -`Example`. +Return a +[hierarchical dictionary representation](/api/data-formats#dict-hierarch) of the +reference annotation contained in this `Example`. > #### Example > @@ -276,3 +277,46 @@ Split one `Example` into multiple `Example` objects, one for each sentence. | Name | Type | Description | | ----------- | --------------- | ---------------------------------------------------------- | | **RETURNS** | `List[Example]` | List of `Example` objects, one for each original sentence. | + +## Alignment {#alignment-object new="3"} + +Calculate alignment tables between two tokenizations. + +### Alignment attributes {#alignment-attributes"} + +| Name | Type | Description | +| ----- | -------------------------------------------------- | ---------------------------------------------------------- | +| `x2y` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `x` to `y`. | +| `y2x` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `y` to `x`. | + + + +The current implementation of the alignment algorithm assumes that both +tokenizations add up to the same string. For example, you'll be able to align +`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not +`["I", "'m"]` and `["I", "am"]`. 
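+
+To make this caveat concrete, here is a small sketch. The commented values
+follow from the `x2y`/`y2x` semantics described below and are illustrative
+rather than verified output:
+
+```python
+from spacy.gold import Alignment
+
+# ["I", "'", "m"] and ["I", "'m"] both add up to the string "I'm", so they
+# can be aligned: "'" and "m" together map onto "'m".
+align = Alignment.from_strings(["I", "'", "m"], ["I", "'m"])
+print(align.x2y.lengths)  # three entries of 1: each token aligns to one token
+print(align.y2x.lengths)  # lengths [1, 2]: "'m" covers both "'" and "m"
+
+# ["I", "'m"] and ["I", "am"] do not add up to the same string, so trying to
+# align them is expected to fail.
+```
+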
+ + + +> #### Example +> +> ```python +> from spacy.gold import Alignment +> +> bert_tokens = ["obama", "'", "s", "podcast"] +> spacy_tokens = ["obama", "'s", "podcast"] +> alignment = Alignment.from_strings(bert_tokens, spacy_tokens) +> a2b = alignment.x2y +> assert list(a2b.dataXd) == [0, 1, 1, 2] +> ``` +> +> If `a2b.dataXd[1] == a2b.dataXd[2] == 1`, that means that `A[1]` (`"'"`) and +> `A[2]` (`"s"`) both align to `B[1]` (`"'s"`). + +### Alignment.from_strings {#classmethod tag="function"} + +| Name | Type | Description | +| ----------- | ----------- | ----------------------------------------------- | +| `A` | list | String values of candidate tokens to align. | +| `B` | list | String values of reference tokens to align. | +| **RETURNS** | `Alignment` | An `Alignment` object describing the alignment. | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 71b53f844..0954fb577 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -468,59 +468,6 @@ Convert a list of Doc objects into the | `id` | int | ID to assign to the JSON. Defaults to `0`. | | **RETURNS** | dict | The data in spaCy's JSON format. | -### gold.align {#align tag="function"} - -Calculate alignment tables between two tokenizations, using the Levenshtein -algorithm. The alignment is case-insensitive. - - - -The current implementation of the alignment algorithm assumes that both -tokenizations add up to the same string. For example, you'll be able to align -`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not -`["I", "'m"]` and `["I", "am"]`. - - - -> #### Example -> -> ```python -> from spacy.gold import align -> -> bert_tokens = ["obama", "'", "s", "podcast"] -> spacy_tokens = ["obama", "'s", "podcast"] -> alignment = align(bert_tokens, spacy_tokens) -> cost, a2b, b2a, a2b_multi, b2a_multi = alignment -> ``` - -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------------------- | -| `tokens_a` | list | String values of candidate tokens to align. | -| `tokens_b` | list | String values of reference tokens to align. | -| **RETURNS** | tuple | A `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the alignment. | - -The returned tuple contains the following alignment information: - -> #### Example -> -> ```python -> a2b = array([0, -1, -1, 2]) -> b2a = array([0, 2, 3]) -> a2b_multi = {1: 1, 2: 1} -> b2a_multi = {} -> ``` -> -> If `a2b[3] == 2`, that means that `tokens_a[3]` aligns to `tokens_b[2]`. If -> there's no one-to-one alignment for a token, it has the value `-1`. - -| Name | Type | Description | -| ----------- | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `cost` | int | The number of misaligned tokens. | -| `a2b` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_a` to indices in `tokens_b`. | -| `b2a` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_b` to indices in `tokens_a`. | -| `a2b_multi` | dict | A dictionary mapping indices in `tokens_a` to indices in `tokens_b`, where multiple tokens of `tokens_a` align to the same token of `tokens_b`. | -| `b2a_multi` | dict | A dictionary mapping indices in `tokens_b` to indices in `tokens_a`, where multiple tokens of `tokens_b` align to the same token of `tokens_a`. 
| - ### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} Encode labelled spans into per-token tags, using the diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 881a0e333..8d3c7e1b6 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1089,51 +1089,44 @@ In situations like that, you often want to align the tokenization so that you can merge annotations from different sources together, or take vectors predicted by a [pretrained BERT model](https://github.com/huggingface/pytorch-transformers) and -apply them to spaCy tokens. spaCy's [`gold.align`](/api/top-level#align) helper -returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the number -of misaligned tokens, the one-to-one mappings of token indices in both -directions and the indices where multiple tokens align to one single token. +apply them to spaCy tokens. spaCy's [`Alignment`](/api/example#alignment-object) object +allows the one-to-one mappings of token indices in both directions as well as +taking into account indices where multiple tokens align to one single token. > #### ✏️ Things to try > > 1. Change the capitalization in one of the token lists – for example, > `"obama"` to `"Obama"`. You'll see that the alignment is case-insensitive. > 2. Change `"podcasts"` in `other_tokens` to `"pod", "casts"`. You should see -> that there are now 4 misaligned tokens and that the new many-to-one mapping -> is reflected in `a2b_multi`. -> 3. Make `other_tokens` and `spacy_tokens` identical. You'll see that the -> `cost` is `0` and all corresponding mappings are also identical. +> that there are now two tokens of length 2 in `y2x`, one corresponding to +> "'s", and one to "podcasts". +> 3. Make `other_tokens` and `spacy_tokens` identical. You'll see that all +> tokens now correspond 1-to-1. ```python ### {executable="true"} -from spacy.gold import align +from spacy.gold import Alignment other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."] spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."] -cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens) -print("Edit distance:", cost) # 3 -print("One-to-one mappings a -> b", a2b) # array([0, 1, 2, 3, -1, -1, 5, 6]) -print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, -1, 6, 7]) -print("Many-to-one mappings a -> b", a2b_multi) # {4: 4, 5: 4} -print("Many-to-one mappings b-> a", b2a_multi) # {} +align = Alignment.from_strings(other_tokens, spacy_tokens) +print(f"a -> b, lengths: {align.x2y.lengths}") # array([1, 1, 1, 1, 1, 1, 1, 1]) +print(f"a -> b, mapping: {align.x2y.dataXd}") # array([0, 1, 2, 3, 4, 4, 5, 6]) : two tokens both refer to "'s" +print(f"b -> a, lengths: {align.y2x.lengths}") # array([1, 1, 1, 1, 2, 1, 1]) : the token "'s" refers to two tokens +print(f"b -> a, mappings: {align.y2x.dataXd}") # array([0, 1, 2, 3, 4, 5, 6, 7]) ``` Here are some insights from the alignment information generated in the example above: -- The edit distance (cost) is `3`: two deletions and one insertion. - The one-to-one mappings for the first four tokens are identical, which means they map to each other. This makes sense because they're also identical in the input: `"i"`, `"listened"`, `"to"` and `"obama"`. 
-- The index mapped to `a2b[6]` is `5`, which means that `other_tokens[6]` +- The value of `x2y.dataXd[6]` is `5`, which means that `other_tokens[6]` (`"podcasts"`) aligns to `spacy_tokens[5]` (also `"podcasts"`). -- `a2b[4]` is `-1`, which means that there is no one-to-one alignment for the - token at `other_tokens[4]`. The token `"'"` doesn't exist on its own in - `spacy_tokens`. The same goes for `a2b[5]` and `other_tokens[5]`, i.e. `"s"`. -- The dictionary `a2b_multi` shows that both tokens 4 and 5 of `other_tokens` - (`"'"` and `"s"`) align to token 4 of `spacy_tokens` (`"'s"`). -- The dictionary `b2a_multi` shows that there are no tokens in `spacy_tokens` - that map to multiple tokens in `other_tokens`. +- `x2y.dataXd[4]` and `x2y.dataXd[5]` are both `4`, which means that both tokens + 4 and 5 of `other_tokens` (`"'"` and `"s"`) align to token 4 of `spacy_tokens` + (`"'s"`).
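+
+To put the alignment to use for the motivating case above, merging vectors
+from a differently tokenized model onto spaCy tokens, here is a minimal
+sketch. The `wordpiece_vectors` array is a stand-in for real model output, and
+the loop follows the flat `lengths`/`dataXd` layout shown in the example:
+
+```python
+import numpy
+from spacy.gold import Alignment
+
+other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
+spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
+# Stand-in for per-wordpiece vectors produced by another model.
+wordpiece_vectors = numpy.random.rand(len(other_tokens), 768)
+
+align = Alignment.from_strings(other_tokens, spacy_tokens)
+token_vectors = []
+start = 0
+for length in align.y2x.lengths:
+    # Indices of the wordpieces that make up this spaCy token.
+    idx = numpy.ravel(align.y2x.dataXd[start : start + length])
+    # Average their vectors to get one vector per spaCy token.
+    token_vectors.append(wordpiece_vectors[idx].mean(axis=0))
+    start += length
+
+assert len(token_vectors) == len(spacy_tokens)
+```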