Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
This commit is contained in:
Adriane Boyd 2020-08-04 13:36:32 +02:00 committed by GitHub
parent b841248589
commit c62fd878a3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 105 additions and 47 deletions

View File

@@ -167,11 +167,30 @@ def test_spans_are_hashable(en_tokenizer):
def test_spans_by_character(doc): def test_spans_by_character(doc):
span1 = doc[1:-2] span1 = doc[1:-2]
# default and specified alignment mode "strict"
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE") span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
assert span1.start_char == span2.start_char assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char assert span1.end_char == span2.end_char
assert span2.label_ == "GPE" assert span2.label_ == "GPE"
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# alignment mode "contract"
span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# alignment mode "expand"
span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
def test_span_to_array(doc): def test_span_to_array(doc):
span = doc[1:-2] span = doc[1:-2]

View File

@@ -352,17 +352,25 @@ cdef class Doc:
def doc(self): def doc(self):
return self return self
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None): def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
"""Create a `Span` object from the slice `doc.text[start : end]`. """Create a `Span` object from the slice
`doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
created.
doc (Doc): The parent document. doc (Doc): The parent document.
start (int): The index of the first character of the span. start_idx (int): The index of the first character of the span.
end (int): The index of the first character after the span. end_idx (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for label (uint64 or string): A label to attach to the Span, e.g. for
named entities. named entities.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. kb_id (uint64 or string): An ID from a KB to capture the meaning of a
named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span. the span.
alignment_mode (str): How character indices are aligned to token
boundaries. Options: "strict" (character indices must be aligned
with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict".
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
DOCS: https://spacy.io/api/doc#char_span DOCS: https://spacy.io/api/doc#char_span
@@ -371,12 +379,29 @@ cdef class Doc:
label = self.vocab.strings.add(label) label = self.vocab.strings.add(label)
if not isinstance(kb_id, int): if not isinstance(kb_id, int):
kb_id = self.vocab.strings.add(kb_id) kb_id = self.vocab.strings.add(kb_id)
cdef int start = token_by_start(self.c, self.length, start_idx) if alignment_mode not in ("strict", "contract", "expand"):
if start == -1: alignment_mode = "strict"
cdef int start = token_by_char(self.c, self.length, start_idx)
if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
return None return None
cdef int end = token_by_end(self.c, self.length, end_idx) # end_idx is exclusive, so find the token at one char before
if end == -1: cdef int end = token_by_char(self.c, self.length, end_idx - 1)
if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
return None return None
# Adjust start and end by alignment_mode
if alignment_mode == "contract":
if self[start].idx < start_idx:
start += 1
if end_idx < self[end].idx + len(self[end]):
end -= 1
# if no tokens are completely within the span, return None
if end < start:
return None
elif alignment_mode == "expand":
# Don't consider the trailing whitespace to be part of the previous
# token
if start_idx == self[start].idx + len(self[start]):
start += 1
# Currently we have the token index, we want the range-end index # Currently we have the token index, we want the range-end index
end += 1 end += 1
cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector) cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
@@ -1167,23 +1192,35 @@ cdef class Doc:
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
cdef int i cdef int i = token_by_char(tokens, length, start_char)
for i in range(length): if i >= 0 and tokens[i].idx == start_char:
if tokens[i].idx == start_char:
return i return i
else: else:
return -1 return -1
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2: cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
cdef int i # end_char is exclusive, so find the token at one char before
for i in range(length): cdef int i = token_by_char(tokens, length, end_char - 1)
if tokens[i].idx + tokens[i].lex.length == end_char: if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
return i return i
else: else:
return -1 return -1
cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
cdef int start = 0, mid, end = length - 1
while start <= end:
mid = (start + end) / 2
if char_idx < tokens[mid].idx:
end = mid - 1
elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
start = mid + 1
else:
return mid
return -1
cdef int set_children_from_heads(TokenC* tokens, int length) except -1: cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
cdef TokenC* head cdef TokenC* head
cdef TokenC* child cdef TokenC* child

View File

@@ -187,8 +187,9 @@ Remove a previously registered extension.
## Doc.char_span {#char_span tag="method" new="2"} ## Doc.char_span {#char_span tag="method" new="2"}
Create a `Span` object from the slice `doc.text[start:end]`. Returns `None` if Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
the character indices don't map to a valid span. `None` if the character indices don't map to a valid span using the default
alignment mode `"strict"`.
> #### Example > #### Example
> >
@@ -199,12 +200,13 @@ the character indices don't map to a valid span.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- | | ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start` | int | The index of the first character of the span. | | `start_idx` | int | The index of the first character of the span. |
| `end` | int | The index of the last character after the span. | | `end_idx` | int | The index of the last character after the span. |
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | | `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | | `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | | `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
| `alignment_mode` | `str` | How character indices are aligned to token boundaries. Options: "strict" (character indices must be aligned with token boundaries), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
| **RETURNS** | `Span` | The newly constructed object or `None`. | | **RETURNS** | `Span` | The newly constructed object or `None`. |
## Doc.similarity {#similarity tag="method" model="vectors"} ## Doc.similarity {#similarity tag="method" model="vectors"}
@@ -647,7 +649,7 @@ The L2 norm of the document's vector representation.
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Type | Description |
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `text` | unicode | A unicode representation of the document text. | | `text` | unicode | A unicode representation of the document text. |
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | | `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | | `mem` | `Pool` | The document's local memory heap, for all C data it owns. |