Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token
boundaries. The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as
before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update
`token_by_start` and `token_by_end` to use `token_by_char` for more
efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently
converted to `strict`.
This commit is contained in:
Adriane Boyd 2020-08-04 13:36:32 +02:00 committed by GitHub
parent b841248589
commit c62fd878a3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 105 additions and 47 deletions

View File

@ -167,11 +167,30 @@ def test_spans_are_hashable(en_tokenizer):
def test_spans_by_character(doc):
span1 = doc[1:-2]
# default and specified alignment mode "strict"
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# alignment mode "contract"
span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# alignment mode "expand"
span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
def test_span_to_array(doc):
span = doc[1:-2]

View File

@ -352,17 +352,25 @@ cdef class Doc:
def doc(self):
return self
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
"""Create a `Span` object from the slice `doc.text[start : end]`.
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
"""Create a `Span` object from the slice
`doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
created.
doc (Doc): The parent document.
start (int): The index of the first character of the span.
end (int): The index of the first character after the span.
start_idx (int): The index of the first character of the span.
end_idx (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for
named entities.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a
named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
alignment_mode (str): How character indices are aligned to token
boundaries. Options: "strict" (character indices must be aligned
with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict".
RETURNS (Span): The newly constructed object.
DOCS: https://spacy.io/api/doc#char_span
@ -371,12 +379,29 @@ cdef class Doc:
label = self.vocab.strings.add(label)
if not isinstance(kb_id, int):
kb_id = self.vocab.strings.add(kb_id)
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
if alignment_mode not in ("strict", "contract", "expand"):
alignment_mode = "strict"
cdef int start = token_by_char(self.c, self.length, start_idx)
if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
return None
cdef int end = token_by_end(self.c, self.length, end_idx)
if end == -1:
# end_idx is exclusive, so find the token at one char before
cdef int end = token_by_char(self.c, self.length, end_idx - 1)
if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
return None
# Adjust start and end by alignment_mode
if alignment_mode == "contract":
if self[start].idx < start_idx:
start += 1
if end_idx < self[end].idx + len(self[end]):
end -= 1
# if no tokens are completely within the span, return None
if end < start:
return None
elif alignment_mode == "expand":
# Don't consider the trailing whitespace to be part of the previous
# token
if start_idx == self[start].idx + len(self[start]):
start += 1
# Currently we have the token index, we want the range-end index
end += 1
cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
@ -1167,23 +1192,35 @@ cdef class Doc:
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
cdef int i
for i in range(length):
if tokens[i].idx == start_char:
return i
cdef int i = token_by_char(tokens, length, start_char)
if i >= 0 and tokens[i].idx == start_char:
return i
else:
return -1
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
cdef int i
for i in range(length):
if tokens[i].idx + tokens[i].lex.length == end_char:
return i
# end_char is exclusive, so find the token at one char before
cdef int i = token_by_char(tokens, length, end_char - 1)
if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
return i
else:
return -1
cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
cdef int start = 0, mid, end = length - 1
while start <= end:
mid = (start + end) / 2
if char_idx < tokens[mid].idx:
end = mid - 1
elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
start = mid + 1
else:
return mid
return -1
cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
cdef TokenC* head
cdef TokenC* child

View File

@ -187,8 +187,9 @@ Remove a previously registered extension.
## Doc.char_span {#char_span tag="method" new="2"}
Create a `Span` object from the slice `doc.text[start:end]`. Returns `None` if
the character indices don't map to a valid span.
Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
`None` if the character indices don't map to a valid span using the default mode
`"strict".
> #### Example
>
@ -198,14 +199,15 @@ the character indices don't map to a valid span.
> assert span.text == "New York"
> ```
| Name | Type | Description |
| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- |
| `start` | int | The index of the first character of the span. |
| `end` | int | The index of the last character after the span. |
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
| **RETURNS** | `Span` | The newly constructed object or `None`. |
| Name | Type | Description |
| ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start_idx` | int | The index of the first character of the span. |
| `end_idx` | int | The index of the last character after the span. |
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
| `mode` | `str` | How character indices snap to token boundaries. Options: "strict" (no snapping), "inside" (span of all tokens completely within the character span), "outside" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
| **RETURNS** | `Span` | The newly constructed object or `None`. |
## Doc.similarity {#similarity tag="method" model="vectors"}
@ -646,26 +648,26 @@ The L2 norm of the document's vector representation.
## Attributes {#attributes}
| Name | Type | Description |
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `text` | unicode | A unicode representation of the document text. |
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
| `vocab` | `Vocab` | The store of lexical types. |
| `tensor` <Tag variant="new">2</Tag> | `ndarray` | Container for dense vector representations. |
| `cats` <Tag variant="new">2</Tag> | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
| `user_data` | - | A generic storage area, for user custom data. |
| `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. |
| `lang_` <Tag variant="new">2.1</Tag> | unicode | Language of the document's vocabulary. |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
| `sentiment` | float | The document's positivity/negativity score, if available. |
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
| Name | Type | Description |
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `text` | unicode | A unicode representation of the document text. |
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
| `vocab` | `Vocab` | The store of lexical types. |
| `tensor` <Tag variant="new">2</Tag> | `ndarray` | Container for dense vector representations. |
| `cats` <Tag variant="new">2</Tag> | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
| `user_data` | - | A generic storage area, for user custom data. |
| `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. |
| `lang_` <Tag variant="new">2.1</Tag> | unicode | Language of the document's vocabulary. |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
| `sentiment` | float | The document's positivity/negativity score, if available. |
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
## Serialization fields {#serialization-fields}