From 14df00ae989f7332535cb3f74cebed2125aecc91 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 21 Jul 2020 10:33:46 +0200 Subject: [PATCH 1/6] Add Morphology and MorphAnalysis API docs Add initial draft of `Morphology` and `MorphAnalysis` API docs. --- website/docs/api/morphanalysis.md | 154 ++++++++++++++++++++++++++++ website/docs/api/morphology.md | 165 ++++++++++++++++++++++++++++++ website/meta/sidebars.json | 2 + 3 files changed, 321 insertions(+) create mode 100644 website/docs/api/morphanalysis.md create mode 100644 website/docs/api/morphology.md diff --git a/website/docs/api/morphanalysis.md b/website/docs/api/morphanalysis.md new file mode 100644 index 000000000..7d883c86b --- /dev/null +++ b/website/docs/api/morphanalysis.md @@ -0,0 +1,154 @@ +--- +title: MorphAnalysis +tag: class +source: spacy/tokens/morphanalysis.pyx +--- + +Stores a single morphological analysis. + + +## MorphAnalysis.\_\_init\_\_ {#init tag="method"} + +Initialize a MorphAnalysis object from a UD FEATS string or a dictionary of +morphological features. + +> #### Example +> +> ```python +> from spacy.tokens import MorphAnalysis +> +> feats = "Feat1=Val1|Feat2=Val2" +> m = MorphAnalysis(nlp.vocab, feats) +> ``` + +| Name | Type | Description | +| ----------- | ------------------ | ----------------------------- | +| `vocab` | `Vocab` | The vocab. | +| `features` | `Union[Dict, str]` | The morphological features. | +| **RETURNS** | `MorphAnalysis` | The newly constructed object. | + + +## MorphAnalysis.\_\_contains\_\_ {#contains tag="method"} + +Whether a feature/value pair is in the analysis. + +> #### Example +> +> ```python +> feats = "Feat1=Val1,Val2|Feat2=Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> assert "Feat1=Val1" in morph +> ``` + +| Name | Type | Description | +| ----------- | ----- | ------------------------------------- | +| **RETURNS** | `bool` | Whether the feature/value pair is in the analysis. 
| + + +## MorphAnalysis.\_\_iter\_\_ {#iter tag="method"} + +Iterate over the feature/value pairs in the analysis. + +> #### Example +> +> ```python +> feats = "Feat1=Val1|Feat2=Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> for feat in morph: +> print(feat) +> ``` + +| Name | Type | Description | +| ---------- | ----- | ------------------------------------- | +| **YIELDS** | `str` | A feature/value pair in the analysis. | + + +## MorphAnalysis.\_\_len\_\_ {#len tag="method"} + +Returns the number of features in the analysis. + +> #### Example +> +> ```python +> feats = "Feat1=Val1,Val2|Feat2=Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> assert len(morph) == 3 +> ``` + +| Name | Type | Description | +| ----------- | ----- | --------------------------------------- | +| **RETURNS** | `int` | The number of features in the analysis. | + + +## MorphAnalysis.\_\_str\_\_ {#str tag="method"} + +Returns the morphological analysis in the UD FEATS string format. + +> #### Example +> +> ```python +> feats = "Feat1=Val1,Val2|Feat2=Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> assert str(morph) == feats +> ``` + +| Name | Type | Description | +| ----------- | ----- | ---------------------------------| +| **RETURNS** | `str` | The analysis in UD FEATS format. | + + +## MorphAnalysis.get {#get tag="method"} + +Retrieve a feature by field. + +> #### Example +> +> ```python +> feats = "Feat1=Val1,Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> assert morph.get("Feat1") == ['Feat1=Val1', 'Feat1=Val2'] +> ``` + +| Name | Type | Description | +| ----------- | ------ | ----------------------------------- | +| `field` | `str` | The field to retrieve. | +| **RETURNS** | `list` | A list of the individual features. | + + +## MorphAnalysis.to_dict {#to_dict tag="method"} + +Produce a dict representation of the analysis, in the same format as the tag +map. 
+ +> #### Example +> +> ```python +> feats = "Feat1=Val1,Val2|Feat2=Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> assert morph.to_dict() == {'Feat1': 'Val1,Val2', 'Feat2': 'Val2'} +> ``` + +| Name | Type | Description | +| ----------- | ------ | -----------------------------------------| +| **RETURNS** | `dict` | The dict representation of the analysis. | + + +## MorphAnalysis.from_id {#from_id tag="classmethod"} + +Create a morphological analysis from a given hash ID. + +> #### Example +> +> ```python +> feats = "Feat1=Val1|Feat2=Val2" +> hash = nlp.vocab.strings[feats] +> morph = MorphAnalysis.from_id(nlp.vocab, hash) +> assert str(morph) == feats +> ``` + +| Name | Type | Description | +| ------- | ------- | -------------------------------- | +| `vocab` | `Vocab` | The vocab. | +| `key` | `int` | The hash of the features string. | + + diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md new file mode 100644 index 000000000..ad279bff7 --- /dev/null +++ b/website/docs/api/morphology.md @@ -0,0 +1,165 @@ +--- +title: Morphology +tag: class +source: spacy/morphology.pyx +--- + +Store the possible morphological analyses for a language, and index them +by hash. To save space on each token, tokens only know the hash of their +morphological analysis, so queries of morphological attributes are delegated to +this class. + + +## Morphology.\_\_init\_\_ {#init tag="method"} + +Create a Morphology object using the tag map, lemmatizer and exceptions. + +> #### Example +> +> ```python +> from spacy.morphology import Morphology +> +> morphology = Morphology(strings, tag_map, lemmatizer) +> ``` + +| Name | Type | Description | +| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | +| `strings` | `StringStore` | The string store. | +| `tag_map` | `Dict[str, Dict]` | The tag map. | +| `lemmatizer`| `Lemmatizer` | The lemmatizer. 
| +| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1", "Feat2": "Val2", ...}}}` | +| **RETURNS** | `Morphology` | The newly constructed object. | + + +## Morphology.add {#add tag="method"} + +Insert a morphological analysis in the morphology table, if not already +present. The morphological analysis may be provided in the UD FEATS format as a +string or in the tag map dictionary format. Returns the hash of the new +analysis. + +> #### Example +> +> ```python +> feats = "Feat1=Val1|Feat2=Val2" +> hash = nlp.vocab.morphology.add(feats) +> assert hash == nlp.vocab.strings[feats] +> ``` + +| Name | Type | Description | +| ----------- | ------------------- | --------------------------- | +| `features` | `Union[Dict, str]` | The morphological features. | + + +## Morphology.get {#get tag="method"} + +> #### Example +> +> ```python +> feats = "Feat1=Val1|Feat2=Val2" +> hash = nlp.vocab.morphology.add(feats) +> assert nlp.vocab.morphology.get(hash) == feats +> ``` + +Get the FEATS string for the hash of the morphological analysis. + +| Name | Type | Description | +| ----------- | ------ | --------------------------------------- | +| `morph` | int | The hash of the morphological analysis. | + + +## Morphology.load_tag_map {#load_tag_map tag="method"} + +Replace the current tag map with the provided tag map. + +| Name | Type | Description | +| ----------- | ------------------ | ------------ | +| `tag_map` | `Dict[str, Dict]` | The tag map. | + + +## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"} + +Replace the current morphological exceptions with the provided exceptions. + +| Name | Type | Description | +| ------------- | ------------------ | ----------------------------- | +| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. | + + +## Morphology.add_special_case {#add_special_case tag="method"} + +Add a special-case rule to the morphological analyzer. 
Tokens whose tag and +orth match the rule will receive the specified properties. + +> #### Example +> +> ```python +> attrs = {"POS": "DET", "Definite": "Def"} +> morphology.add_special_case("DT", "the", attrs) +> ``` + +| Name | Type | Description | +| ----------- | ---- | ---------------------------------------------- | +| `tag_str` | str | The fine-grained tag. | +| `orth_str` | str | The token text. | +| `attrs` | dict | The features to assign for this token and tag. | + + +## Morphology.exc {#exc tag="property"} + +The current morphological exceptions. + +| Name | Type | Description | +| ---------- | ----- | --------------------------------------------------- | +| **RETURNS** | dict | The current dictionary of morphological exceptions. | + + +## Morphology.lemmatize {#lemmatize tag="method"} + +TODO + + +## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"} + +Convert a string FEATS representation to a dictionary of features and values in +the same format as the tag map. + +> #### Example +> +> ```python +> from spacy.morphology import Morphology +> d = Morphology.feats_to_dict("Feat1=Val1|Feat2=Val2") +> assert d == {"Feat1": "Val1", "Feat2": "Val2"} +> ``` + +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------------------------- | +| `feats` | str | The morphological features in Universal Dependencies FEATS format. | +| **RETURNS** | dict | The morphological features as a dictionary. | + + +## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"} + +Convert a dictionary of features and values to a string FEATS representation. 
+ +> #### Example +> +> ```python +> from spacy.morphology import Morphology +> f = Morphology.dict_to_feats({"Feat1": "Val1", "Feat2": "Val2"}) +> assert f == "Feat1=Val1|Feat2=Val2" +> ``` + +| Name | Type | Description | +| ------------ | ----------------- | --------------------------------------------------------------------- | +| `feats_dict` | `Dict[str, str]` | The morphological features as a dictionary. | +| **RETURNS** | str | The morphological features in Universal Dependencies FEATS format. | + + +## Attributes {#attributes} + +| Name | Type | Description | +| ------------- | ----- | -------------------------------------------- | +| `FEATURE_SEP` | `str` | The FEATS feature separator. Default is `\|`. | +| `FIELD_SEP` | `str` | The FEATS field separator. Default is `=`. | +| `VALUE_SEP` | `str` | The FEATS value separator. Default is `,`. | diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 3fed561d0..1357c9d62 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -70,6 +70,7 @@ { "text": "Token", "url": "/api/token" }, { "text": "Span", "url": "/api/span" }, { "text": "Lexeme", "url": "/api/lexeme" }, + { "text": "MorphAnalysis", "url": "/api/morphanalysis" }, { "text": "Example", "url": "/api/example" }, { "text": "DocBin", "url": "/api/docbin" } ] @@ -102,6 +103,7 @@ { "text": "StringStore", "url": "/api/stringstore" }, { "text": "Vectors", "url": "/api/vectors" }, { "text": "Lookups", "url": "/api/lookups" }, + { "text": "Morphology", "url": "/api/morphology" }, { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Scorer", "url": "/api/scorer" }, { "text": "Corpus", "url": "/api/corpus" } From fcd3a4abe3a5b0c9d296456f134a7de6fd187edf Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 21 Jul 2020 13:05:58 +0200 Subject: [PATCH 2/6] Add morph to Token API docs --- website/docs/api/token.md | 1 + 1 file changed, 1 insertion(+) diff --git a/website/docs/api/token.md b/website/docs/api/token.md 
index 549189cad..e60a038ba 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -450,6 +450,7 @@ The L2 norm of the token's vector representation. | `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | | `tag` | int | Fine-grained part-of-speech. | | `tag_` | str | Fine-grained part-of-speech. | +| `morph` | `MorphAnalysis` | Morphological analysis. | | `dep` | int | Syntactic dependency relation. | | `dep_` | str | Syntactic dependency relation. | | `lang` | int | Language of the parent document's vocabulary. | From d3385f4be254bfb3a36ab8404ede3509e9bc7dc6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 21 Jul 2020 13:06:22 +0200 Subject: [PATCH 3/6] Add Morphology and MorphAnalysis to overview --- website/docs/usage/101/_architecture.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md index 95158b67d..d2e1aee39 100644 --- a/website/docs/usage/101/_architecture.md +++ b/website/docs/usage/101/_architecture.md @@ -24,6 +24,7 @@ an **annotated document**. It also orchestrates training and serialization. | [`Span`](/api/span) | A slice from a `Doc` object. | | [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. | | [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. | +| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. | ### Processing pipeline {#architecture-pipeline} @@ -32,7 +33,7 @@ an **annotated document**. It also orchestrates training and serialization. | [`Language`](/api/language) | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. 
| | [`Tokenizer`](/api/tokenizer) | Segment text, and create `Doc` objects with the discovered segment boundaries. | | [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. | -| `Morphology` | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. | +| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. | | [`Tagger`](/api/tagger) | Annotate part-of-speech tags on `Doc` objects. | | [`DependencyParser`](/api/dependencyparser) | Annotate syntactic dependencies on `Doc` objects. | | [`EntityRecognizer`](/api/entityrecognizer) | Annotate named entities, e.g. persons or products, on `Doc` objects. | From 941b9e33f7c927844895a7c27e454297835300c9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 22 Jul 2020 17:53:22 +0200 Subject: [PATCH 4/6] Add Token.morph_ --- website/docs/api/token.md | 1 + 1 file changed, 1 insertion(+) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index e60a038ba..1cb833089 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -451,6 +451,7 @@ The L2 norm of the token's vector representation. | `tag` | int | Fine-grained part-of-speech. | | `tag_` | str | Fine-grained part-of-speech. | | `morph` | `MorphAnalysis` | Morphological analysis. | +| `morph_` | str | Morphological analysis in UD FEATS format. | | `dep` | int | Syntactic dependency relation. | | `dep_` | str | Syntactic dependency relation. | | `lang` | int | Language of the parent document's vocabulary. 
| From 8f44584bef4f41f5cbd72fd4292c1a727c6f33db Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 23 Jul 2020 08:51:31 +0200 Subject: [PATCH 5/6] Update MorphAnalysis.get and related examples --- website/docs/api/morphanalysis.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/website/docs/api/morphanalysis.md b/website/docs/api/morphanalysis.md index 7d883c86b..5c2356ad9 100644 --- a/website/docs/api/morphanalysis.md +++ b/website/docs/api/morphanalysis.md @@ -52,10 +52,9 @@ Iterate over the feature/value pairs in the analysis. > #### Example > > ```python -> feats = "Feat1=Val1|Feat2=Val2" +> feats = "Feat1=Val1,Val3|Feat2=Val2" > morph = MorphAnalysis(nlp.vocab, feats) -> for feat in morph: -> print(feat) +> assert list(morph) == ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"] > ``` | Name | Type | Description | @@ -99,14 +98,14 @@ Returns the morphological analysis in the UD FEATS string format. ## MorphAnalysis.get {#get tag="method"} -Retrieve a feature by field. +Retrieve values for a feature by field. > #### Example > > ```python > feats = "Feat1=Val1,Val2" > morph = MorphAnalysis(nlp.vocab, feats) -> assert morph.get("Feat1") == ["Val1", "Val2"] > ``` | Name | Type | Description | @@ -125,7 +124,7 @@ map. 
> ```python > feats = "Feat1=Val1,Val2|Feat2=Val2" > morph = MorphAnalysis(nlp.vocab, feats) -> assert morph.to_dict() == {'Feat1': 'Val1,Val2', 'Feat2': 'Val2'} +> assert morph.to_dict() == {"Feat1": "Val1,Val2", "Feat2": "Val2"} > ``` | Name | Type | Description | From 41525901efdec585a70c94940b18e44f3e474aab Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 23 Jul 2020 08:58:22 +0200 Subject: [PATCH 6/6] Move MorphAnalysis to Other section --- website/meta/sidebars.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 1357c9d62..6685e565b 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -70,7 +70,6 @@ { "text": "Token", "url": "/api/token" }, { "text": "Span", "url": "/api/span" }, { "text": "Lexeme", "url": "/api/lexeme" }, - { "text": "MorphAnalysis", "url": "/api/morphanalysis" }, { "text": "Example", "url": "/api/example" }, { "text": "DocBin", "url": "/api/docbin" } ] @@ -104,6 +103,7 @@ { "text": "Vectors", "url": "/api/vectors" }, { "text": "Lookups", "url": "/api/lookups" }, { "text": "Morphology", "url": "/api/morphology" }, + { "text": "MorphAnalysis", "url": "/api/morphanalysis" }, { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Scorer", "url": "/api/scorer" }, { "text": "Corpus", "url": "/api/corpus" }