mirror of https://github.com/explosion/spaCy.git
Allow Doc.char_span to snap to token boundaries (#5849)
* Allow Doc.char_span to snap to token boundaries Add a `mode` option to allow `Doc.char_span` to snap to token boundaries. The `mode` options: * `strict`: character offsets must match token boundaries (default, same as before) * `inside`: all tokens completely within the character span * `outside`: all tokens at least partially covered by the character span Add a new helper function `token_by_char` that returns the token corresponding to a character position in the text. Update `token_by_start` and `token_by_end` to use `token_by_char` for more efficient searching. * Remove unused import * Rename mode to alignment_mode Rename `mode` to `alignment_mode` with the options `strict`/`contract`/`expand`. Any unrecognized modes are silently converted to `strict`.
This commit is contained in:
parent
b841248589
commit
c62fd878a3
|
@ -167,11 +167,30 @@ def test_spans_are_hashable(en_tokenizer):
|
|||
|
||||
def test_spans_by_character(doc):
|
||||
span1 = doc[1:-2]
|
||||
|
||||
# default and specified alignment mode "strict"
|
||||
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
|
||||
assert span1.start_char == span2.start_char
|
||||
assert span1.end_char == span2.end_char
|
||||
assert span2.label_ == "GPE"
|
||||
|
||||
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
|
||||
assert span1.start_char == span2.start_char
|
||||
assert span1.end_char == span2.end_char
|
||||
assert span2.label_ == "GPE"
|
||||
|
||||
# alignment mode "contract"
|
||||
span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
|
||||
assert span1.start_char == span2.start_char
|
||||
assert span1.end_char == span2.end_char
|
||||
assert span2.label_ == "GPE"
|
||||
|
||||
# alignment mode "expand"
|
||||
span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
|
||||
assert span1.start_char == span2.start_char
|
||||
assert span1.end_char == span2.end_char
|
||||
assert span2.label_ == "GPE"
|
||||
|
||||
|
||||
def test_span_to_array(doc):
|
||||
span = doc[1:-2]
|
||||
|
|
|
@ -352,17 +352,25 @@ cdef class Doc:
|
|||
def doc(self):
|
||||
return self
|
||||
|
||||
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
|
||||
"""Create a `Span` object from the slice `doc.text[start : end]`.
|
||||
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
|
||||
"""Create a `Span` object from the slice
|
||||
`doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
|
||||
created.
|
||||
|
||||
doc (Doc): The parent document.
|
||||
start (int): The index of the first character of the span.
|
||||
end (int): The index of the first character after the span.
|
||||
start_idx (int): The index of the first character of the span.
|
||||
end_idx (int): The index of the first character after the span.
|
||||
label (uint64 or string): A label to attach to the Span, e.g. for
|
||||
named entities.
|
||||
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
|
||||
kb_id (uint64 or string): An ID from a KB to capture the meaning of a
|
||||
named entity.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||
the span.
|
||||
alignment_mode (str): How character indices are aligned to token
|
||||
boundaries. Options: "strict" (character indices must be aligned
|
||||
with token boundaries), "contract" (span of all tokens completely
|
||||
within the character span), "expand" (span of all tokens at least
|
||||
partially covered by the character span). Defaults to "strict".
|
||||
RETURNS (Span): The newly constructed object.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#char_span
|
||||
|
@ -371,12 +379,29 @@ cdef class Doc:
|
|||
label = self.vocab.strings.add(label)
|
||||
if not isinstance(kb_id, int):
|
||||
kb_id = self.vocab.strings.add(kb_id)
|
||||
cdef int start = token_by_start(self.c, self.length, start_idx)
|
||||
if start == -1:
|
||||
if alignment_mode not in ("strict", "contract", "expand"):
|
||||
alignment_mode = "strict"
|
||||
cdef int start = token_by_char(self.c, self.length, start_idx)
|
||||
if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
|
||||
return None
|
||||
cdef int end = token_by_end(self.c, self.length, end_idx)
|
||||
if end == -1:
|
||||
# end_idx is exclusive, so find the token at one char before
|
||||
cdef int end = token_by_char(self.c, self.length, end_idx - 1)
|
||||
if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
|
||||
return None
|
||||
# Adjust start and end by alignment_mode
|
||||
if alignment_mode == "contract":
|
||||
if self[start].idx < start_idx:
|
||||
start += 1
|
||||
if end_idx < self[end].idx + len(self[end]):
|
||||
end -= 1
|
||||
# if no tokens are completely within the span, return None
|
||||
if end < start:
|
||||
return None
|
||||
elif alignment_mode == "expand":
|
||||
# Don't consider the trailing whitespace to be part of the previous
|
||||
# token
|
||||
if start_idx == self[start].idx + len(self[start]):
|
||||
start += 1
|
||||
# Currently we have the token index, we want the range-end index
|
||||
end += 1
|
||||
cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
|
||||
|
@ -1167,23 +1192,35 @@ cdef class Doc:
|
|||
|
||||
|
||||
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
|
||||
cdef int i
|
||||
for i in range(length):
|
||||
if tokens[i].idx == start_char:
|
||||
return i
|
||||
cdef int i = token_by_char(tokens, length, start_char)
|
||||
if i >= 0 and tokens[i].idx == start_char:
|
||||
return i
|
||||
else:
|
||||
return -1
|
||||
|
||||
|
||||
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
|
||||
cdef int i
|
||||
for i in range(length):
|
||||
if tokens[i].idx + tokens[i].lex.length == end_char:
|
||||
return i
|
||||
# end_char is exclusive, so find the token at one char before
|
||||
cdef int i = token_by_char(tokens, length, end_char - 1)
|
||||
if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
|
||||
return i
|
||||
else:
|
||||
return -1
|
||||
|
||||
|
||||
cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
|
||||
cdef int start = 0, mid, end = length - 1
|
||||
while start <= end:
|
||||
mid = (start + end) / 2
|
||||
if char_idx < tokens[mid].idx:
|
||||
end = mid - 1
|
||||
elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
|
||||
start = mid + 1
|
||||
else:
|
||||
return mid
|
||||
return -1
|
||||
|
||||
|
||||
cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
|
||||
cdef TokenC* head
|
||||
cdef TokenC* child
|
||||
|
|
|
@ -187,8 +187,9 @@ Remove a previously registered extension.
|
|||
|
||||
## Doc.char_span {#char_span tag="method" new="2"}
|
||||
|
||||
Create a `Span` object from the slice `doc.text[start:end]`. Returns `None` if
|
||||
the character indices don't map to a valid span.
|
||||
Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
|
||||
`None` if the character indices don't map to a valid span using the default mode
|
||||
`"strict"`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -198,14 +199,15 @@ the character indices don't map to a valid span.
|
|||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- |
|
||||
| `start` | int | The index of the first character of the span. |
|
||||
| `end` | int | The index of the last character after the span. |
|
||||
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
|
||||
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
|
||||
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
|
||||
| **RETURNS** | `Span` | The newly constructed object or `None`. |
|
||||
| Name | Type | Description |
|
||||
| ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start_idx` | int | The index of the first character of the span. |
|
||||
| `end_idx` | int | The index of the last character after the span. |
|
||||
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
|
||||
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
|
||||
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
|
||||
| `alignment_mode` | `str` | How character indices are aligned to token boundaries. Options: "strict" (character indices must be aligned with token boundaries), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
|
||||
| **RETURNS** | `Span` | The newly constructed object or `None`. |
|
||||
|
||||
## Doc.similarity {#similarity tag="method" model="vectors"}
|
||||
|
||||
|
@ -646,26 +648,26 @@ The L2 norm of the document's vector representation.
|
|||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `text` | unicode | A unicode representation of the document text. |
|
||||
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
|
||||
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
|
||||
| `vocab` | `Vocab` | The store of lexical types. |
|
||||
| `tensor` <Tag variant="new">2</Tag> | `ndarray` | Container for dense vector representations. |
|
||||
| `cats` <Tag variant="new">2</Tag> | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
|
||||
| `user_data` | - | A generic storage area, for user custom data. |
|
||||
| `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. |
|
||||
| `lang_` <Tag variant="new">2.1</Tag> | unicode | Language of the document's vocabulary. |
|
||||
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
|
||||
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
|
||||
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
|
||||
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
|
||||
| `sentiment` | float | The document's positivity/negativity score, if available. |
|
||||
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
|
||||
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
|
||||
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
|
||||
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `text` | unicode | A unicode representation of the document text. |
|
||||
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
|
||||
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
|
||||
| `vocab` | `Vocab` | The store of lexical types. |
|
||||
| `tensor` <Tag variant="new">2</Tag> | `ndarray` | Container for dense vector representations. |
|
||||
| `cats` <Tag variant="new">2</Tag> | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
|
||||
| `user_data` | - | A generic storage area, for user custom data. |
|
||||
| `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. |
|
||||
| `lang_` <Tag variant="new">2.1</Tag> | unicode | Language of the document's vocabulary. |
|
||||
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
|
||||
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
|
||||
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
|
||||
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
|
||||
| `sentiment` | float | The document's positivity/negativity score, if available. |
|
||||
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
|
||||
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
|
||||
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
|
||||
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
Loading…
Reference in New Issue