From c62fd878a38fa0ce16243022b5dab5d043aaf31f Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 4 Aug 2020 13:36:32 +0200
Subject: [PATCH] Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries

Add a `mode` option to allow `Doc.char_span` to snap to token boundaries.
The `mode` options:

* `strict`: character offsets must match token boundaries (default, same as before)
* `inside`: all tokens completely within the character span
* `outside`: all tokens at least partially covered by the character span

Add a new helper function `token_by_char` that returns the token
corresponding to a character position in the text. Update `token_by_start`
and `token_by_end` to use `token_by_char` for more efficient searching.

* Remove unused import

* Rename mode to alignment_mode

Rename `mode` to `alignment_mode` with the options
`strict`/`contract`/`expand`. Any unrecognized modes are silently converted
to `strict`.
---
 spacy/tests/doc/test_span.py | 19 ++++++++++
 spacy/tokens/doc.pyx         | 71 +++++++++++++++++++++++++++---------
 website/docs/api/doc.md      | 62 ++++++++++++++++---------------
 3 files changed, 105 insertions(+), 47 deletions(-)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 25fa421b7..107078df9 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -167,11 +167,30 @@ def test_spans_are_hashable(en_tokenizer):
 
 def test_spans_by_character(doc):
     span1 = doc[1:-2]
+
+    # default and specified alignment mode "strict"
     span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
     assert span1.start_char == span2.start_char
     assert span1.end_char == span2.end_char
     assert span2.label_ == "GPE"
+    span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+
+    # alignment mode "contract"
+    span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+
+    # alignment mode "expand"
+    span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+
 
 
 def test_span_to_array(doc):
     span = doc[1:-2]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 5b03dc5d2..89573ba09 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -352,17 +352,25 @@ cdef class Doc:
     def doc(self):
         return self
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
-        """Create a `Span` object from the slice `doc.text[start : end]`.
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
+        """Create a `Span` object from the slice
+        `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
+        created.
 
         doc (Doc): The parent document.
-        start (int): The index of the first character of the span.
-        end (int): The index of the first character after the span.
+        start_idx (int): The index of the first character of the span.
+        end_idx (int): The index of the first character after the span.
         label (uint64 or string): A label to attach to the Span, e.g. for
             named entities.
-        kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
+        kb_id (uint64 or string): An ID from a KB to capture the meaning of a
+            named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
+        alignment_mode (str): How character indices are aligned to token
+            boundaries. Options: "strict" (character indices must be aligned
+            with token boundaries), "contract" (span of all tokens completely
+            within the character span), "expand" (span of all tokens at least
+            partially covered by the character span). Defaults to "strict".
         RETURNS (Span): The newly constructed object.
 
         DOCS: https://spacy.io/api/doc#char_span
@@ -371,12 +379,29 @@ cdef class Doc:
             label = self.vocab.strings.add(label)
         if not isinstance(kb_id, int):
             kb_id = self.vocab.strings.add(kb_id)
-        cdef int start = token_by_start(self.c, self.length, start_idx)
-        if start == -1:
+        if alignment_mode not in ("strict", "contract", "expand"):
+            alignment_mode = "strict"
+        cdef int start = token_by_char(self.c, self.length, start_idx)
+        if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
             return None
-        cdef int end = token_by_end(self.c, self.length, end_idx)
-        if end == -1:
+        # end_idx is exclusive, so find the token at one char before
+        cdef int end = token_by_char(self.c, self.length, end_idx - 1)
+        if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
             return None
+        # Adjust start and end by alignment_mode
+        if alignment_mode == "contract":
+            if self[start].idx < start_idx:
+                start += 1
+            if end_idx < self[end].idx + len(self[end]):
+                end -= 1
+            # if no tokens are completely within the span, return None
+            if end < start:
+                return None
+        elif alignment_mode == "expand":
+            # Don't consider the trailing whitespace to be part of the previous
+            # token
+            if start_idx == self[start].idx + len(self[start]):
+                start += 1
         # Currently we have the token index, we want the range-end index
         end += 1
         cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
@@ -1167,23 +1192,35 @@ cdef class Doc:
 
 
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
-    cdef int i
-    for i in range(length):
-        if tokens[i].idx == start_char:
-            return i
+    cdef int i = token_by_char(tokens, length, start_char)
+    if i >= 0 and tokens[i].idx == start_char:
+        return i
     else:
         return -1
 
 
 cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
-    cdef int i
-    for i in range(length):
-        if tokens[i].idx + tokens[i].lex.length == end_char:
-            return i
+    # end_char is exclusive, so find the token at one char before
+    cdef int i = token_by_char(tokens, length, end_char - 1)
+    if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
+        return i
     else:
         return -1
 
 
+cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
+    cdef int start = 0, mid, end = length - 1
+    while start <= end:
+        mid = (start + end) / 2
+        if char_idx < tokens[mid].idx:
+            end = mid - 1
+        elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
+            start = mid + 1
+        else:
+            return mid
+    return -1
+
+
 cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
     cdef TokenC* head
     cdef TokenC* child
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 7decc2278..420e12fcb 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -187,8 +187,9 @@ Remove a previously registered extension.
 
 ## Doc.char_span {#char_span tag="method" new="2"}
 
-Create a `Span` object from the slice `doc.text[start:end]`. Returns `None` if
-the character indices don't map to a valid span.
+Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
+`None` if the character indices don't map to a valid span using the default
+alignment mode `"strict"`.
 
 > #### Example
 >
 > ```python
 > doc = nlp("I like New York")
 > span = doc.char_span(7, 15, label="GPE")
@@ -198,14 +199,15 @@ the character indices don't map to a valid span.
 > assert span.text == "New York"
 > ```
 
-| Name        | Type                                     | Description                                                            |
-| ----------- | ---------------------------------------- | ---------------------------------------------------------------------- |
-| `start`     | int                                      | The index of the first character of the span.                          |
-| `end`       | int                                      | The index of the last character after the span.                        |
-| `label`     | uint64 / unicode                         | A label to attach to the span, e.g. for named entities.                |
-| `kb_id` 2.2 | uint64 / unicode                         | An ID from a knowledge base to capture the meaning of a named entity.  |
-| `vector`    | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                  |
-| **RETURNS** | `Span`                                   | The newly constructed object or `None`.                                |
+| Name             | Type                                     | Description                                                                                                                                                                                                                                                   |
+| ---------------- | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `start_idx`      | int                                      | The index of the first character of the span.                                                                                                                                                                                                                 |
+| `end_idx`        | int                                      | The index of the first character after the span.                                                                                                                                                                                                              |
+| `label`          | uint64 / unicode                         | A label to attach to the span, e.g. for named entities.                                                                                                                                                                                                      |
+| `kb_id` 2.2      | uint64 / unicode                         | An ID from a knowledge base to capture the meaning of a named entity.                                                                                                                                                                                         |
+| `vector`         | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                                                                                                                                                                                                         |
+| `alignment_mode` | str                                      | How character indices snap to token boundaries. Options: "strict" (no snapping), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
+| **RETURNS**      | `Span`                                   | The newly constructed object or `None`.                                                                                                                                                                                                                       |
 
 ## Doc.similarity {#similarity tag="method" model="vectors"}
 
@@ -646,26 +648,26 @@ The L2 norm of the document's vector representation.
 
 ## Attributes {#attributes}
 
-| Name               | Type         | Description                                                                                                                                                                                                                                                                                   |
-| ------------------ | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `text`             | unicode      | A unicode representation of the document text. |
-| `text_with_ws`     | unicode      | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
-| `mem`              | `Pool`       | The document's local memory heap, for all C data it owns. |
-| `vocab`            | `Vocab`      | The store of lexical types. |
-| `tensor` 2         | `ndarray`    | Container for dense vector representations. |
-| `cats` 2           | dict         | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
-| `user_data`        | -            | A generic storage area, for user custom data. |
-| `lang` 2.1         | int          | Language of the document's vocabulary. |
-| `lang_` 2.1        | unicode      | Language of the document's vocabulary. |
-| `is_tagged`        | bool         | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
-| `is_parsed`        | bool         | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
-| `is_sentenced`     | bool         | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
-| `is_nered` 2.1     | bool         | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
-| `sentiment`        | float        | The document's positivity/negativity score, if available. |
-| `user_hooks`       | dict         | A dictionary that allows customization of the `Doc`'s properties. |
-| `user_token_hooks` | dict         | A dictionary that allows customization of properties of `Token` children. |
-| `user_span_hooks`  | dict         | A dictionary that allows customization of properties of `Span` children. |
-| `_`                | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
+| Name               | Type         | Description                                                                                                                                                                      |
+| ------------------ | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `text`             | unicode      | A unicode representation of the document text. |
+| `text_with_ws`     | unicode      | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
+| `mem`              | `Pool`       | The document's local memory heap, for all C data it owns. |
+| `vocab`            | `Vocab`      | The store of lexical types. |
+| `tensor` 2         | `ndarray`    | Container for dense vector representations. |
+| `cats` 2           | dict         | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
+| `user_data`        | -            | A generic storage area, for user custom data. |
+| `lang` 2.1         | int          | Language of the document's vocabulary. |
+| `lang_` 2.1        | unicode      | Language of the document's vocabulary. |
+| `is_tagged`        | bool         | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
+| `is_parsed`        | bool         | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
+| `is_sentenced`     | bool         | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
+| `is_nered` 2.1     | bool         | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
+| `sentiment`        | float        | The document's positivity/negativity score, if available. |
+| `user_hooks`       | dict         | A dictionary that allows customization of the `Doc`'s properties. |
+| `user_token_hooks` | dict         | A dictionary that allows customization of properties of `Token` children. |
+| `user_span_hooks`  | dict         | A dictionary that allows customization of properties of `Span` children. |
+| `_`                | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
 
 ## Serialization fields {#serialization-fields}
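
---

For reviewers, a short usage sketch of the behavior this patch adds. This is illustration only and not part of the patch: it assumes a spaCy build with this change applied and uses `spacy.blank("en")`, whose default tokenizer places "New" and "York" such that `doc.text[7:15] == "New York"` in the sentence below.

```python
# Illustration only (not part of the patch); assumes spaCy with this change applied.
import spacy

nlp = spacy.blank("en")
doc = nlp("I like New York in Autumn.")

# "strict" (default): both offsets must fall exactly on token boundaries.
assert doc.char_span(7, 15).text == "New York"
assert doc.char_span(8, 15) is None  # char 8 is inside the token "New"

# "contract": keep only the tokens lying completely within the character
# span, so the partially covered "New" is dropped.
assert doc.char_span(8, 15, alignment_mode="contract").text == "York"

# "expand": keep every token at least partially covered by the character
# span, so the partially covered "New" is included.
assert doc.char_span(8, 15, alignment_mode="expand").text == "New York"

# Any unrecognized mode is silently treated as "strict".
assert doc.char_span(8, 15, alignment_mode="unknown") is None
```

Note the asymmetry between the two non-strict modes: "contract" can still return `None` when no token lies completely inside the character span (the `end < start` check in `char_span` above), while "expand" always covers at least the tokens the span touches.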