From c62fd878a38fa0ce16243022b5dab5d043aaf31f Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 4 Aug 2020 13:36:32 +0200
Subject: [PATCH] Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token boundaries.
The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update `token_by_start`
and `token_by_end` to use `token_by_char` for more efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently converted
to `strict`.
---
 spacy/tests/doc/test_span.py | 19 ++++++++++
 spacy/tokens/doc.pyx         | 71 +++++++++++++++++++++++++++---------
 website/docs/api/doc.md      | 62 ++++++++++++++++---------------
 3 files changed, 105 insertions(+), 47 deletions(-)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 25fa421b7..107078df9 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -167,11 +167,30 @@ def test_spans_are_hashable(en_tokenizer):
 
 def test_spans_by_character(doc):
     span1 = doc[1:-2]
+
+    # default and specified alignment mode "strict"
     span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
     assert span1.start_char == span2.start_char
     assert span1.end_char == span2.end_char
     assert span2.label_ == "GPE"
+    span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+
+    # alignment mode "contract"
+    span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+
+    # alignment mode "expand"
+    span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+
 
 
 def test_span_to_array(doc):
     span = doc[1:-2]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 5b03dc5d2..89573ba09 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -352,17 +352,25 @@ cdef class Doc:
     def doc(self):
         return self
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
-        """Create a `Span` object from the slice `doc.text[start : end]`.
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
+        """Create a `Span` object from the slice
+        `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
+        created.
 
         doc (Doc): The parent document.
-        start (int): The index of the first character of the span.
-        end (int): The index of the first character after the span.
+        start_idx (int): The index of the first character of the span.
+        end_idx (int): The index of the first character after the span.
         label (uint64 or string): A label to attach to the Span, e.g. for
             named entities.
-        kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
+        kb_id (uint64 or string): An ID from a KB to capture the meaning of a
+            named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
+        alignment_mode (str): How character indices are aligned to token
+            boundaries. Options: "strict" (character indices must be aligned
+            with token boundaries), "contract" (span of all tokens completely
+            within the character span), "expand" (span of all tokens at least
+            partially covered by the character span). Defaults to "strict".
         RETURNS (Span): The newly constructed object.
 
         DOCS: https://spacy.io/api/doc#char_span
@@ -371,12 +379,29 @@ cdef class Doc:
             label = self.vocab.strings.add(label)
         if not isinstance(kb_id, int):
             kb_id = self.vocab.strings.add(kb_id)
-        cdef int start = token_by_start(self.c, self.length, start_idx)
-        if start == -1:
+        if alignment_mode not in ("strict", "contract", "expand"):
+            alignment_mode = "strict"
+        cdef int start = token_by_char(self.c, self.length, start_idx)
+        if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
             return None
-        cdef int end = token_by_end(self.c, self.length, end_idx)
-        if end == -1:
+        # end_idx is exclusive, so find the token at one char before
+        cdef int end = token_by_char(self.c, self.length, end_idx - 1)
+        if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
             return None
+        # Adjust start and end by alignment_mode
+        if alignment_mode == "contract":
+            if self[start].idx < start_idx:
+                start += 1
+            if end_idx < self[end].idx + len(self[end]):
+                end -= 1
+            # if no tokens are completely within the span, return None
+            if end < start:
+                return None
+        elif alignment_mode == "expand":
+            # Don't consider the trailing whitespace to be part of the previous
+            # token
+            if start_idx == self[start].idx + len(self[start]):
+                start += 1
         # Currently we have the token index, we want the range-end index
         end += 1
         cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
@@ -1167,23 +1192,35 @@ cdef class Doc:
 
 
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
-    cdef int i
-    for i in range(length):
-        if tokens[i].idx == start_char:
-            return i
+    cdef int i = token_by_char(tokens, length, start_char)
+    if i >= 0 and tokens[i].idx == start_char:
+        return i
     else:
         return -1
 
 
 cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
-    cdef int i
-    for i in range(length):
-        if tokens[i].idx + tokens[i].lex.length == end_char:
-            return i
+    # end_char is exclusive, so find the token at one char before
+    cdef int i = token_by_char(tokens, length, end_char - 1)
+    if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
+        return i
     else:
         return -1
 
 
+cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
+    cdef int start = 0, mid, end = length - 1
+    while start <= end:
+        mid = (start + end) / 2
+        if char_idx < tokens[mid].idx:
+            end = mid - 1
+        elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
+            start = mid + 1
+        else:
+            return mid
+    return -1
+
+
 cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
     cdef TokenC* head
     cdef TokenC* child
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 7decc2278..420e12fcb 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -187,8 +187,9 @@ Remove a previously registered extension.
 
 ## Doc.char_span {#char_span tag="method" new="2"}
 
-Create a `Span` object from the slice `doc.text[start:end]`. Returns `None` if
-the character indices don't map to a valid span.
+Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
+`None` if the character indices don't map to a valid span using the default
+alignment mode `"strict"`.
 
 > #### Example
 >
 > ```python
 > doc = nlp("I like New York")
 > span = doc.char_span(7, 15, label="GPE")
@@ -198,14 +199,15 @@ the character indices don't map to a valid span.
 > assert span.text == "New York"
 > ```
 
-| Name        | Type                                     | Description                                                            |
-| ----------- | ---------------------------------------- | ---------------------------------------------------------------------- |
-| `start`     | int                                      | The index of the first character of the span.                          |
-| `end`       | int                                      | The index of the last character after the span.                        |
-| `label`     | uint64 / unicode                         | A label to attach to the span, e.g. for named entities.                |
-| `kb_id` 2.2 | uint64 / unicode                         | An ID from a knowledge base to capture the meaning of a named entity.  |
-| `vector`    | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                  |
-| **RETURNS** | `Span`                                   | The newly constructed object or `None`.                                |
+| Name             | Type                                     | Description                                                                                                                                                                                                                                                   |
+| ---------------- | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `start_idx`      | int                                      | The index of the first character of the span.                                                                                                                                                                                                                 |
+| `end_idx`        | int                                      | The index of the first character after the span.                                                                                                                                                                                                              |
+| `label`          | uint64 / unicode                         | A label to attach to the span, e.g. for named entities.                                                                                                                                                                                                      |
+| `kb_id` 2.2      | uint64 / unicode                         | An ID from a knowledge base to capture the meaning of a named entity.                                                                                                                                                                                         |
+| `vector`         | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                                                                                                                                                                                                         |
+| `alignment_mode` | str                                      | How character indices snap to token boundaries. Options: "strict" (no snapping), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
+| **RETURNS**      | `Span`                                   | The newly constructed object or `None`.                                                                                                                                                                                                                       |
 
 ## Doc.similarity {#similarity tag="method" model="vectors"}
 
@@ -646,26 +648,26 @@ The L2 norm of the document's vector representation.
 
 ## Attributes {#attributes}
 
-| Name               | Type         | Description                                                                                                                                                                                                                                                                                   |
-| ------------------ | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `text`             | unicode      | A unicode representation of the document text. |
-| `text_with_ws`     | unicode      | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
-| `mem`              | `Pool`       | The document's local memory heap, for all C data it owns. |
-| `vocab`            | `Vocab`      | The store of lexical types. |
-| `tensor` 2         | `ndarray`    | Container for dense vector representations. |
-| `cats` 2           | dict         | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
-| `user_data`        | -            | A generic storage area, for user custom data. |
-| `lang` 2.1         | int          | Language of the document's vocabulary. |
-| `lang_` 2.1        | unicode      | Language of the document's vocabulary. |
-| `is_tagged`        | bool         | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
-| `is_parsed`        | bool         | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
-| `is_sentenced`     | bool         | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
-| `is_nered` 2.1     | bool         | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
-| `sentiment`        | float        | The document's positivity/negativity score, if available. |
-| `user_hooks`       | dict         | A dictionary that allows customization of the `Doc`'s properties. |
-| `user_token_hooks` | dict         | A dictionary that allows customization of properties of `Token` children. |
-| `user_span_hooks`  | dict         | A dictionary that allows customization of properties of `Span` children. |
-| `_`                | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
+| Name               | Type         | Description                                                                                                                                                                      |
+| ------------------ | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `text`             | unicode      | A unicode representation of the document text. |
+| `text_with_ws`     | unicode      | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
+| `mem`              | `Pool`       | The document's local memory heap, for all C data it owns. |
+| `vocab`            | `Vocab`      | The store of lexical types. |
+| `tensor` 2         | `ndarray`    | Container for dense vector representations. |
+| `cats` 2           | dict         | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
+| `user_data`        | -            | A generic storage area, for user custom data. |
+| `lang` 2.1         | int          | Language of the document's vocabulary. |
+| `lang_` 2.1        | unicode      | Language of the document's vocabulary. |
+| `is_tagged`        | bool         | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
+| `is_parsed`        | bool         | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
+| `is_sentenced`     | bool         | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
+| `is_nered` 2.1     | bool         | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
+| `sentiment`        | float        | The document's positivity/negativity score, if available. |
+| `user_hooks`       | dict         | A dictionary that allows customization of the `Doc`'s properties. |
+| `user_token_hooks` | dict         | A dictionary that allows customization of properties of `Token` children. |
+| `user_span_hooks`  | dict         | A dictionary that allows customization of properties of `Span` children. |
+| `_`                | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
 
 ## Serialization fields {#serialization-fields}
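
---

For reviewers, a short usage sketch of the behavior this patch adds. This is illustration only and not part of the patch: it assumes a spaCy build with this change applied and uses `spacy.blank("en")`, whose default tokenizer places "New" and "York" such that `doc.text[7:15] == "New York"` in the sentence below.

```python
# Illustration only (not part of the patch); assumes spaCy with this change applied.
import spacy

nlp = spacy.blank("en")
doc = nlp("I like New York in Autumn.")

# "strict" (default): both offsets must fall exactly on token boundaries.
assert doc.char_span(7, 15).text == "New York"
assert doc.char_span(8, 15) is None  # char 8 is inside the token "New"

# "contract": keep only the tokens lying completely within the character
# span, so the partially covered "New" is dropped.
assert doc.char_span(8, 15, alignment_mode="contract").text == "York"

# "expand": keep every token at least partially covered by the character
# span, so the partially covered "New" is included.
assert doc.char_span(8, 15, alignment_mode="expand").text == "New York"

# Any unrecognized mode is silently treated as "strict".
assert doc.char_span(8, 15, alignment_mode="unknown") is None
```

Note the asymmetry between the two non-strict modes: "contract" can still return `None` when no token lies completely inside the character span (the `end < start` check in `char_span` above), while "expand" always covers at least the tokens the span touches.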