From 4096a79de74f3edd2ce4499f7b4e7ea2d2ba47dc Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 27 Jan 2021 13:40:42 +0100
Subject: [PATCH] Add alignment mode error and fix Doc.char_span docs (#6820)

* Raise an error on an unrecognized alignment mode rather than
defaulting to `strict`
* Fix the `Doc.char_span` API doc alignment mode details
---
 spacy/errors.py              |  1 +
 spacy/tests/doc/test_span.py |  6 ++++++
 spacy/tokens/doc.pyx         |  5 +++--
 website/docs/api/doc.md      | 18 +++++++++---------
 4 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 7f9164694..9a6123b1f 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -591,6 +591,7 @@ class Errors(object):
     E200 = ("Specifying a base model with a pretrained component '{component}' "
             "can not be combined with adding a pretrained Tok2Vec layer.")
     E201 = ("Span index out of range.")
+    E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
 
 @add_codes
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index df41aedf5..a5da50fbd 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -197,6 +197,12 @@ def test_spans_by_character(doc):
     assert span1.end_char == span2.end_char
     assert span2.label_ == "GPE"
 
+    # unsupported alignment mode
+    with pytest.raises(ValueError):
+        span2 = doc.char_span(
+            span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
+        )
+
 
 def test_span_to_array(doc):
     span = doc[1:-2]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index aa8d1cc19..584f9d483 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -379,8 +379,9 @@ cdef class Doc:
             label = self.vocab.strings.add(label)
         if not isinstance(kb_id, int):
             kb_id = self.vocab.strings.add(kb_id)
-        if alignment_mode not in ("strict", "contract", "expand"):
-            alignment_mode = "strict"
+        alignment_modes = ("strict", "contract", "expand")
+        if alignment_mode not in alignment_modes:
+            raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes)))
         cdef int start = token_by_char(self.c, self.length, start_idx)
         if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
             return None
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 93f63e582..077e6a13a 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -199,15 +199,15 @@ Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
 > assert span.text == "New York"
 > ```
 
-| Name                                 | Type                                     | Description                                                                                                                                                                                                                                                 |
-| ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `start_idx`                          | int                                      | The index of the first character of the span.                                                                                                                                                                                                               |
-| `end_idx`                            | int                                      | The index of the last character after the span.                                                                                                                                                                                                             |
-| `label`                              | uint64 / unicode                         | A label to attach to the span, e.g. for named entities.                                                                                                                                                                                                     |
-| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode                         | An ID from a knowledge base to capture the meaning of a named entity.                                                                                                                                                                                       |
-| `vector`                             | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                                                                                                                                                                                                       |
-| `alignment_mode`                     | `str`                                    | How character indices snap to token boundaries. Options: "strict" (no snapping), "inside" (span of all tokens completely within the character span), "outside" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
-| **RETURNS**                          | `Span`                                   | The newly constructed object or `None`.                                                                                                                                                                                                                     |
+| Name                                 | Type                                     | Description                                                                                                                                                                                                                                                  |
+| ------------------------------------ | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `start_idx`                          | int                                      | The index of the first character of the span.                                                                                                                                                                                                                |
+| `end_idx`                            | int                                      | The index of the last character after the span.                                                                                                                                                                                                              |
+| `label`                              | uint64 / unicode                         | A label to attach to the span, e.g. for named entities.                                                                                                                                                                                                      |
+| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode                         | An ID from a knowledge base to capture the meaning of a named entity.                                                                                                                                                                                        |
+| `vector`                             | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                                                                                                                                                                                                        |
+| `alignment_mode`                     | `str`                                    | How character indices snap to token boundaries. Options: "strict" (no snapping), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
+| **RETURNS**                          | `Span`                                   | The newly constructed object or `None`.                                                                                                                                                                                                                      |
 
 ## Doc.similarity {#similarity tag="method" model="vectors"}