From 457babfa0c581d868fe16b418c0dcef357d78a97 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 16 Jun 2020 20:22:03 +0200
Subject: [PATCH 1/5] Update alignment example for new gold.align

---
 website/docs/usage/linguistic-features.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index bcc943436..a442cc7a0 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1130,9 +1130,9 @@ from spacy.gold import align
 other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
 spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
 cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
-print("Misaligned tokens:", cost) # 2
+print("Edit distance:", cost) # 3
 print("One-to-one mappings a -> b", a2b) # array([0, 1, 2, 3, -1, -1, 5, 6])
-print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, 5, 6, 7])
+print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, -1, 6, 7])
 print("Many-to-one mappings a -> b", a2b_multi) # {4: 4, 5: 4}
 print("Many-to-one mappings b-> a", b2a_multi) # {}
 ```
@@ -1140,7 +1140,7 @@ print("Many-to-one mappings b-> a", b2a_multi) # {}
 Here are some insights from the alignment information generated in the example
 above:
 
-- Two tokens are misaligned.
+- The edit distance (cost) is `3`: two deletions and one insertion.
 - The one-to-one mappings for the first four tokens are identical, which means
   they map to each other. This makes sense because they're also identical in the
   input: `"i"`, `"listened"`, `"to"` and `"obama"`.

From 9aff317ca788cc996da5125e7d9c4783c8ab9f7e Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 16 Jun 2020 20:26:57 +0200
Subject: [PATCH 2/5] Update POS in tagging example

---
 website/docs/usage/101/_pos-deps.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md
index 1a438e424..1e8960edf 100644
--- a/website/docs/usage/101/_pos-deps.md
+++ b/website/docs/usage/101/_pos-deps.md
@@ -36,7 +36,7 @@ for token in doc:
 | Text    | Lemma   | POS     | Tag   | Dep        | Shape   | alpha   | stop    |
 | ------- | ------- | ------- | ----- | ---------- | ------- | ------- | ------- |
 | Apple   | apple   | `PROPN` | `NNP` | `nsubj`    | `Xxxxx` | `True`  | `False` |
-| is      | be      | `VERB`  | `VBZ` | `aux`      | `xx`    | `True`  | `True`  |
+| is      | be      | `AUX`   | `VBZ` | `aux`      | `xx`    | `True`  | `True`  |
 | looking | look    | `VERB`  | `VBG` | `ROOT`     | `xxxx`  | `True`  | `False` |
 | at      | at      | `ADP`   | `IN`  | `prep`     | `xx`    | `True`  | `True`  |
 | buying  | buy     | `VERB`  | `VBG` | `pcomp`    | `xxxx`  | `True`  | `False` |

From a6abdfbc3c5a298b9d0e547451701f6705fd09b7 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 16 Jun 2020 20:35:45 +0200
Subject: [PATCH 3/5] Fix numpy.zeros() dtype for Doc.from_array

---
 website/docs/usage/linguistic-features.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index a442cc7a0..1e3b129ac 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -471,7 +471,7 @@
 doc = nlp.make_doc("London is a big city in the United Kingdom.")
 print("Before", doc.ents) # []
 header = [ENT_IOB, ENT_TYPE]
-attr_array = numpy.zeros((len(doc), len(header)))
+attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64")
 attr_array[0, 0] = 3 # B
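 # Note added in this review, not part of the patched file: ent_iob is
 # encoded numerically (0 = unset, 1 = I, 2 = O, 3 = B), so the 3 above
 # marks "London" as the beginning of an entity. The label on the next
 # line is stored as its StringStore hash, which is why the hash-sized
 # "uint64" dtype introduced by this patch matters.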
 attr_array[0, 1] = doc.vocab.strings["GPE"]
 doc.from_array(header, attr_array)

From f0fd77648fb488c26852cd1494b69073e5766b65 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 16 Jun 2020 20:36:21 +0200
Subject: [PATCH 4/5] Change example title to Dr.

Change the example title to "Dr." so that the current model actually
excludes the title in the initial example.
---
 website/docs/usage/rule-based-matching.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 1db2405d1..f7866fe31 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -1158,17 +1158,17 @@ what you need for your application.
 > available corpus.
 
 For example, the corpus spaCy's [English models](/models/en) were trained on
-defines a `PERSON` entity as just the **person name**, without titles like "Mr"
-or "Dr". This makes sense, because it makes it easier to resolve the entity type
-back to a knowledge base. But what if your application needs the full names,
-_including_ the titles?
+defines a `PERSON` entity as just the **person name**, without titles like "Mr."
+or "Dr.". This makes sense, because it makes it easier to resolve the entity
+type back to a knowledge base. But what if your application needs the full
+names, _including_ the titles?
 
 ```python
 ### {executable="true"}
 import spacy
 
 nlp = spacy.load("en_core_web_sm")
-doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.")
+doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
 print([(ent.text, ent.label_) for ent in doc.ents])
 ```
 
@@ -1233,7 +1233,7 @@ def expand_person_entities(doc):
 
 # Add the component after the named entity recognizer
 nlp.add_pipe(expand_person_entities, after='ner')
 
-doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.")
+doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
 print([(ent.text, ent.label_) for ent in doc.ents])
 ```

From 02369f91d307a6ba43f1d9ad97efbb5e348cc599 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 16 Jun 2020 20:41:17 +0200
Subject: [PATCH 5/5] Fix spacy convert argument

---
 website/docs/usage/adding-languages.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md
index 29de08266..98d4fdec9 100644
--- a/website/docs/usage/adding-languages.md
+++ b/website/docs/usage/adding-languages.md
@@ -634,7 +634,7 @@ One thing to keep in mind is that spaCy expects to train its models from
 **whole documents**, not just single sentences. If your corpus only contains
 single sentences, spaCy's models will never learn to expect multi-sentence
 documents, leading to low performance on real text. To mitigate this problem, you can use
-the `-N` argument to the `spacy convert` command, to merge some of the sentences
+the `-n` argument to the `spacy convert` command, to merge some of the sentences
 into longer pseudo-documents.
 
 ### Training the tagger and parser {#train-tagger-parser}
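
To see the corrected flag in context, here is a minimal sketch of a conversion run. The file paths and the sentence count are hypothetical, not taken from the patch; `-n` is the short form of the CLI's `--n-sents` option.

```bash
# Convert a hypothetical corpus of single-sentence CoNLL-U examples,
# merging every 10 sentences into one pseudo-document.
python -m spacy convert ./train.conllu ./corpus-json/ -n 10
```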