Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
This commit is contained in:
Adriane Boyd 2020-08-04 13:36:32 +02:00 committed by GitHub
parent b841248589
commit c62fd878a3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 105 additions and 47 deletions

View File

@@ -167,11 +167,30 @@ def test_spans_are_hashable(en_tokenizer):
def test_spans_by_character(doc): def test_spans_by_character(doc):
span1 = doc[1:-2] span1 = doc[1:-2]
# default and specified alignment mode "strict"
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE") span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
assert span1.start_char == span2.start_char assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char assert span1.end_char == span2.end_char
assert span2.label_ == "GPE" assert span2.label_ == "GPE"
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# alignment mode "contract"
span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# alignment mode "expand"
span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
def test_span_to_array(doc): def test_span_to_array(doc):
span = doc[1:-2] span = doc[1:-2]

View File

@@ -352,17 +352,25 @@ cdef class Doc:
def doc(self): def doc(self):
return self return self
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None): def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
"""Create a `Span` object from the slice `doc.text[start : end]`. """Create a `Span` object from the slice
`doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
created.
doc (Doc): The parent document. doc (Doc): The parent document.
start (int): The index of the first character of the span. start_idx (int): The index of the first character of the span.
end (int): The index of the first character after the span. end_idx (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for label (uint64 or string): A label to attach to the Span, e.g. for
named entities. named entities.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. kb_id (uint64 or string): An ID from a KB to capture the meaning of a
named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span. the span.
alignment_mode (str): How character indices are aligned to token
boundaries. Options: "strict" (character indices must be aligned
with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict".
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
DOCS: https://spacy.io/api/doc#char_span DOCS: https://spacy.io/api/doc#char_span
@@ -371,12 +379,29 @@ cdef class Doc:
label = self.vocab.strings.add(label) label = self.vocab.strings.add(label)
if not isinstance(kb_id, int): if not isinstance(kb_id, int):
kb_id = self.vocab.strings.add(kb_id) kb_id = self.vocab.strings.add(kb_id)
cdef int start = token_by_start(self.c, self.length, start_idx) if alignment_mode not in ("strict", "contract", "expand"):
if start == -1: alignment_mode = "strict"
cdef int start = token_by_char(self.c, self.length, start_idx)
if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
return None return None
cdef int end = token_by_end(self.c, self.length, end_idx) # end_idx is exclusive, so find the token at one char before
if end == -1: cdef int end = token_by_char(self.c, self.length, end_idx - 1)
if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
return None return None
# Adjust start and end by alignment_mode
if alignment_mode == "contract":
if self[start].idx < start_idx:
start += 1
if end_idx < self[end].idx + len(self[end]):
end -= 1
# if no tokens are completely within the span, return None
if end < start:
return None
elif alignment_mode == "expand":
# Don't consider the trailing whitespace to be part of the previous
# token
if start_idx == self[start].idx + len(self[start]):
start += 1
# Currently we have the token index, we want the range-end index # Currently we have the token index, we want the range-end index
end += 1 end += 1
cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector) cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
@@ -1167,23 +1192,35 @@ cdef class Doc:
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
cdef int i cdef int i = token_by_char(tokens, length, start_char)
for i in range(length): if i >= 0 and tokens[i].idx == start_char:
if tokens[i].idx == start_char:
return i return i
else: else:
return -1 return -1
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2: cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
cdef int i # end_char is exclusive, so find the token at one char before
for i in range(length): cdef int i = token_by_char(tokens, length, end_char - 1)
if tokens[i].idx + tokens[i].lex.length == end_char: if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
return i return i
else: else:
return -1 return -1
cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
cdef int start = 0, mid, end = length - 1
while start <= end:
mid = (start + end) / 2
if char_idx < tokens[mid].idx:
end = mid - 1
elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
start = mid + 1
else:
return mid
return -1
cdef int set_children_from_heads(TokenC* tokens, int length) except -1: cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
cdef TokenC* head cdef TokenC* head
cdef TokenC* child cdef TokenC* child

View File

@@ -187,8 +187,9 @@ Remove a previously registered extension.
## Doc.char_span {#char_span tag="method" new="2"} ## Doc.char_span {#char_span tag="method" new="2"}
Create a `Span` object from the slice `doc.text[start:end]`. Returns `None` if Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
the character indices don't map to a valid span. `None` if the character indices don't map to a valid span using the default
alignment mode `"strict"`.
> #### Example > #### Example
> >
@@ -199,12 +200,13 @@ the character indices don't map to a valid span.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- | | ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start` | int | The index of the first character of the span. | | `start_idx` | int | The index of the first character of the span. |
| `end` | int | The index of the last character after the span. | | `end_idx` | int | The index of the last character after the span. |
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | | `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | | `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | | `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
| `alignment_mode` | `str` | How character indices are aligned to token boundaries. Options: "strict" (character indices must be aligned with token boundaries), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
| **RETURNS** | `Span` | The newly constructed object or `None`. | | **RETURNS** | `Span` | The newly constructed object or `None`. |
## Doc.similarity {#similarity tag="method" model="vectors"} ## Doc.similarity {#similarity tag="method" model="vectors"}
@@ -647,7 +649,7 @@ The L2 norm of the document's vector representation.
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Type | Description |
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `text` | unicode | A unicode representation of the document text. | | `text` | unicode | A unicode representation of the document text. |
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | | `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | | `mem` | `Pool` | The document's local memory heap, for all C data it owns. |