From 4096a79de74f3edd2ce4499f7b4e7ea2d2ba47dc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 27 Jan 2021 13:40:42 +0100 Subject: [PATCH] Add alignment mode error and fix Doc.char_span docs (#6820) * Raise an error on an unrecognized alignment mode rather than defaulting to `strict` * Fix the `Doc.char_span` API doc alignment mode details --- spacy/errors.py | 1 + spacy/tests/doc/test_span.py | 6 ++++++ spacy/tokens/doc.pyx | 5 +++-- website/docs/api/doc.md | 18 +++++++++--------- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 7f9164694..9a6123b1f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -591,6 +591,7 @@ class Errors(object): E200 = ("Specifying a base model with a pretrained component '{component}' " "can not be combined with adding a pretrained Tok2Vec layer.") E201 = ("Span index out of range.") + E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") @add_codes diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index df41aedf5..a5da50fbd 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -197,6 +197,12 @@ def test_spans_by_character(doc): assert span1.end_char == span2.end_char assert span2.label_ == "GPE" + # unsupported alignment mode + with pytest.raises(ValueError): + span2 = doc.char_span( + span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk" + ) + def test_span_to_array(doc): span = doc[1:-2] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index aa8d1cc19..584f9d483 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -379,8 +379,9 @@ cdef class Doc: label = self.vocab.strings.add(label) if not isinstance(kb_id, int): kb_id = self.vocab.strings.add(kb_id) - if alignment_mode not in ("strict", "contract", "expand"): - alignment_mode = "strict" + alignment_modes = ("strict", "contract", "expand") + if alignment_mode not in alignment_modes: + raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes))) cdef int start = token_by_char(self.c, self.length, start_idx) if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx): return None diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 93f63e582..077e6a13a 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -199,15 +199,15 @@ Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns > assert span.text == "New York" > ``` -| Name | Type | Description | -| ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `start_idx` | int | The index of the first character of the span. | -| `end_idx` | int | The index of the last character after the span. | -| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | -| `kb_id` 2.2 | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | -| `alignment_mode` | `str` | How character indices snap to token boundaries. Options: "strict" (no snapping), "inside" (span of all tokens completely within the character span), "outside" (span of all tokens at least partially covered by the character span). Defaults to "strict". | -| **RETURNS** | `Span` | The newly constructed object or `None`. | +| Name | Type | Description | +| ------------------------------------ | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `start_idx` | int | The index of the first character of the span. | +| `end_idx` | int | The index of the last character after the span. | +| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | +| `kb_id` 2.2 | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| `alignment_mode` | `str` | How character indices snap to token boundaries. Options: "strict" (no snapping), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". | +| **RETURNS** | `Span` | The newly constructed object or `None`. | ## Doc.similarity {#similarity tag="method" model="vectors"}