mirror of https://github.com/explosion/spaCy.git
Allow Doc.char_span to snap to token boundaries (#5849)
* Allow Doc.char_span to snap to token boundaries

  Add a `mode` option to allow `Doc.char_span` to snap to token boundaries.
  The `mode` options:

  * `strict`: character offsets must match token boundaries (default, same as before)
  * `inside`: all tokens completely within the character span
  * `outside`: all tokens at least partially covered by the character span

  Add a new helper function `token_by_char` that returns the token
  corresponding to a character position in the text. Update `token_by_start`
  and `token_by_end` to use `token_by_char` for more efficient searching.

* Remove unused import

* Rename mode to alignment_mode

  Rename `mode` to `alignment_mode` with the options
  `strict`/`contract`/`expand`. Any unrecognized modes are silently converted
  to `strict`.
This commit is contained in:
parent b841248589
commit c62fd878a3
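Before the diff itself, a minimal usage sketch of the new argument, assuming a spaCy build that already includes this change and a blank English tokenizer; the sentence and character offsets below are illustrative only, not taken from the commit:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I like New York in Autumn.")  # doc.text[7:15] == "New York"

# Default and explicit "strict": offsets must line up exactly with token boundaries.
assert doc.char_span(7, 15, label="GPE").text == "New York"
assert doc.char_span(8, 15, label="GPE", alignment_mode="strict") is None  # 8 falls inside "New"

# "contract": keep only the tokens that lie completely inside the character span.
assert doc.char_span(6, 16, label="GPE", alignment_mode="contract").text == "New York"

# "expand": keep every token the character span touches at all.
assert doc.char_span(8, 14, label="GPE", alignment_mode="expand").text == "New York"
```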
@@ -167,11 +167,30 @@ def test_spans_are_hashable(en_tokenizer):
 def test_spans_by_character(doc):
     span1 = doc[1:-2]
+
+    # default and specified alignment mode "strict"
     span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
     assert span1.start_char == span2.start_char
     assert span1.end_char == span2.end_char
     assert span2.label_ == "GPE"
+
+    span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+
+    # alignment mode "contract"
+    span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+
+    # alignment mode "expand"
+    span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
 
 
 def test_span_to_array(doc):
     span = doc[1:-2]
@@ -352,17 +352,25 @@ cdef class Doc:
     def doc(self):
         return self
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
-        """Create a `Span` object from the slice `doc.text[start : end]`.
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
+        """Create a `Span` object from the slice
+        `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
+        created.
 
         doc (Doc): The parent document.
-        start (int): The index of the first character of the span.
-        end (int): The index of the first character after the span.
+        start_idx (int): The index of the first character of the span.
+        end_idx (int): The index of the first character after the span.
         label (uint64 or string): A label to attach to the Span, e.g. for
             named entities.
-        kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
+        kb_id (uint64 or string): An ID from a KB to capture the meaning of a
+            named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
+        alignment_mode (str): How character indices are aligned to token
+            boundaries. Options: "strict" (character indices must be aligned
+            with token boundaries), "contract" (span of all tokens completely
+            within the character span), "expand" (span of all tokens at least
+            partially covered by the character span). Defaults to "strict".
         RETURNS (Span): The newly constructed object.
 
         DOCS: https://spacy.io/api/doc#char_span
@@ -371,12 +379,29 @@ cdef class Doc:
             label = self.vocab.strings.add(label)
         if not isinstance(kb_id, int):
             kb_id = self.vocab.strings.add(kb_id)
-        cdef int start = token_by_start(self.c, self.length, start_idx)
-        if start == -1:
+        if alignment_mode not in ("strict", "contract", "expand"):
+            alignment_mode = "strict"
+        cdef int start = token_by_char(self.c, self.length, start_idx)
+        if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
             return None
-        cdef int end = token_by_end(self.c, self.length, end_idx)
-        if end == -1:
+        # end_idx is exclusive, so find the token at one char before
+        cdef int end = token_by_char(self.c, self.length, end_idx - 1)
+        if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
             return None
+        # Adjust start and end by alignment_mode
+        if alignment_mode == "contract":
+            if self[start].idx < start_idx:
+                start += 1
+            if end_idx < self[end].idx + len(self[end]):
+                end -= 1
+            # if no tokens are completely within the span, return None
+            if end < start:
+                return None
+        elif alignment_mode == "expand":
+            # Don't consider the trailing whitespace to be part of the previous
+            # token
+            if start_idx == self[start].idx + len(self[start]):
+                start += 1
         # Currently we have the token index, we want the range-end index
         end += 1
         cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
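To make the boundary adjustment in this hunk concrete, here is a hypothetical walk-through, assuming a blank English pipeline; the sentence and offsets are chosen purely for illustration and are not part of the diff:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I like New York")  # "like" = chars 2-5, trailing space = 6, "New" = 7-9, "York" = 11-14

# Characters [6, 13) begin on the space after "like" and end in the middle of "York".
assert doc.char_span(6, 13) is None                                      # "strict": misaligned offsets -> None
assert doc.char_span(6, 13, alignment_mode="contract").text == "New"     # only tokens fully inside the span survive
assert doc.char_span(6, 13, alignment_mode="expand").text == "New York"  # every touched token is kept, but the
                                                                          # trailing space does not pull in "like"
```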
@@ -1167,23 +1192,35 @@ cdef class Doc:
 
 
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
-    cdef int i
-    for i in range(length):
-        if tokens[i].idx == start_char:
-            return i
+    cdef int i = token_by_char(tokens, length, start_char)
+    if i >= 0 and tokens[i].idx == start_char:
+        return i
     else:
         return -1
 
 
 cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
-    cdef int i
-    for i in range(length):
-        if tokens[i].idx + tokens[i].lex.length == end_char:
-            return i
+    # end_char is exclusive, so find the token at one char before
+    cdef int i = token_by_char(tokens, length, end_char - 1)
+    if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
+        return i
     else:
         return -1
 
 
+cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
+    cdef int start = 0, mid, end = length - 1
+    while start <= end:
+        mid = (start + end) / 2
+        if char_idx < tokens[mid].idx:
+            end = mid - 1
+        elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
+            start = mid + 1
+        else:
+            return mid
+    return -1
+
+
 cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
     cdef TokenC* head
     cdef TokenC* child
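The new `token_by_char` helper is a binary search over token start offsets that treats a token's trailing whitespace as belonging to that token. A pure-Python sketch of the same logic, using a hypothetical `(idx, length, trailing_space)` tuple per token instead of spaCy's `TokenC` struct:

```python
def token_by_char(tokens, char_idx):
    # tokens: list of (idx, length, trailing_space) tuples, sorted by idx.
    # Returns the index of the token whose text or trailing space covers
    # char_idx, or -1 if char_idx lies outside the text.
    start, end = 0, len(tokens) - 1
    while start <= end:
        mid = (start + end) // 2
        idx, length, trailing_space = tokens[mid]
        if char_idx < idx:
            end = mid - 1                              # look in the left half
        elif char_idx >= idx + length + trailing_space:
            start = mid + 1                            # look in the right half
        else:
            return mid
    return -1

# "I like New York": "I"(0), "like"(2), "New"(7), "York"(11)
tokens = [(0, 1, 1), (2, 4, 1), (7, 3, 1), (11, 4, 0)]
assert token_by_char(tokens, 6) == 1    # the space after "like" maps to "like"
assert token_by_char(tokens, 8) == 2    # inside "New"
assert token_by_char(tokens, 99) == -1  # past the end of the text
```

`token_by_start` and `token_by_end` then only need to verify that the token found this way actually begins or ends at the requested offset.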
@@ -187,8 +187,9 @@ Remove a previously registered extension.
 
 ## Doc.char_span {#char_span tag="method" new="2"}
 
-Create a `Span` object from the slice `doc.text[start:end]`. Returns `None` if
-the character indices don't map to a valid span.
+Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
+`None` if the character indices don't map to a valid span using the default
+alignment mode `"strict"`.
 
 > #### Example
 >
@@ -199,12 +200,13 @@ the character indices don't map to a valid span.
 > ```
 
 | Name | Type | Description |
-| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- |
-| `start` | int | The index of the first character of the span. |
-| `end` | int | The index of the last character after the span. |
+| ------------------------------------ | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `start_idx` | int | The index of the first character of the span. |
+| `end_idx` | int | The index of the first character after the span. |
 | `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
 | `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
 | `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
+| `alignment_mode` | `str` | How character indices snap to token boundaries. Options: "strict" (no snapping), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
 | **RETURNS** | `Span` | The newly constructed object or `None`. |
 
 ## Doc.similarity {#similarity tag="method" model="vectors"}
@@ -647,7 +649,7 @@ The L2 norm of the document's vector representation.
 ## Attributes {#attributes}
 
 | Name | Type | Description |
-| --------------------------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------------------------------- | ------------ | ----------------------------------------------------------------------------------------- |
 | `text` | unicode | A unicode representation of the document text. |
 | `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
 | `mem` | `Pool` | The document's local memory heap, for all C data it owns. |