💫 Allow Span to take text label (#3031)

Fixes #3027.

* Allow `Span.__init__` to take unicode values for the `label` argument.
* Allow `Span.label_` to be writeable.

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
Matthew Honnibal 2018-12-08 13:08:41 +01:00 committed by Ines Montani
parent 11a29af751
commit 2c2db0c492
2 changed files with 18 additions and 3 deletions

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import pytest import pytest
from spacy.attrs import ORTH, LENGTH from spacy.attrs import ORTH, LENGTH
from spacy.tokens import Doc from spacy.tokens import Doc, Span
from spacy.vocab import Vocab from spacy.vocab import Vocab
from ..util import get_doc from ..util import get_doc
@ -154,6 +154,17 @@ def test_span_as_doc(doc):
assert span.text == span_doc.text.strip() assert span.text == span_doc.text.strip()
def test_span_string_label(doc):
    """A text label passed to the `Span` constructor is exposed through both
    `label_` (the text) and `label` (the vocab string-store lookup for it)."""
    text = 'hello'
    labelled = Span(doc, 0, 1, label=text)
    # The text view round-trips unchanged.
    assert labelled.label_ == text
    # The other view matches what the string store returns for the same text.
    assert labelled.label == doc.vocab.strings[text]
def test_span_string_set_label(doc):
    """Assigning text to `Span.label_` on an existing span updates both
    `label_` (the text) and `label` (the vocab string-store lookup for it)."""
    text = 'hello'
    # Start from a span created without an explicit label.
    target = Span(doc, 0, 1)
    target.label_ = text
    assert target.label_ == text
    assert target.label == doc.vocab.strings[text]
def test_span_ents_property(doc): def test_span_ents_property(doc):
"""Test span.ents for the """ """Test span.ents for the """
doc.ents = [ doc.ents = [

View File

@ -15,7 +15,7 @@ from ..parts_of_speech cimport univ_pos_t
from ..util import normalize_slice from ..util import normalize_slice
from ..attrs cimport IS_PUNCT, IS_SPACE from ..attrs cimport IS_PUNCT, IS_SPACE
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..compat import is_config from ..compat import is_config, basestring_
from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
from .underscore import Underscore, get_ext_args from .underscore import Underscore, get_ext_args
@ -42,7 +42,7 @@ cdef class Span:
raise ValueError(Errors.E046.format(name=name)) raise ValueError(Errors.E046.format(name=name))
return Underscore.span_extensions.pop(name) return Underscore.span_extensions.pop(name)
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, def __cinit__(self, Doc doc, int start, int end, label=0,
vector=None, vector_norm=None): vector=None, vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`. """Create a `Span` object from the slice `doc[start : end]`.
@ -64,6 +64,8 @@ cdef class Span:
self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1]) self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
else: else:
self.end_char = 0 self.end_char = 0
if isinstance(label, basestring_):
label = doc.vocab.strings.add(label)
if label not in doc.vocab.strings: if label not in doc.vocab.strings:
raise ValueError(Errors.E084.format(label=label)) raise ValueError(Errors.E084.format(label=label))
self.label = label self.label = label
@ -601,6 +603,8 @@ cdef class Span:
"""RETURNS (unicode): The span's label.""" """RETURNS (unicode): The span's label."""
def __get__(self): def __get__(self):
return self.doc.vocab.strings[self.label] return self.doc.vocab.strings[self.label]
def __set__(self, unicode label_):
    # Add the text to the vocab's string store and keep the value it
    # returns as this span's `label`.
    string_store = self.doc.vocab.strings
    self.label = string_store.add(label_)
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: