mirror of https://github.com/explosion/spaCy.git
Add gold.spans_from_biluo_tags helper (#3227)
This commit is contained in:
parent
5e16490d9d
commit
f25bd9f5e4
|
@ -12,7 +12,7 @@ import srsly
|
||||||
|
|
||||||
from . import _align
|
from . import _align
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .tokens import Doc
|
from .tokens import Doc, Span
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
from . import util
|
from . import util
|
||||||
from .util import minibatch, itershuffle
|
from .util import minibatch, itershuffle
|
||||||
|
@ -659,6 +659,24 @@ def biluo_tags_from_offsets(doc, entities, missing='O'):
|
||||||
return biluo
|
return biluo
|
||||||
|
|
||||||
|
|
||||||
|
def spans_from_biluo_tags(doc, tags):
|
||||||
|
"""Encode per-token tags following the BILUO scheme into Span object, e.g.
|
||||||
|
to overwrite the doc.ents.
|
||||||
|
|
||||||
|
doc (Doc): The document that the BILUO tags refer to.
|
||||||
|
entities (iterable): A sequence of BILUO tags with each tag describing one
|
||||||
|
token. Each tags string will be of the form of either "", "O" or
|
||||||
|
"{action}-{label}", where action is one of "B", "I", "L", "U".
|
||||||
|
RETURNS (list): A sequence of Span objects.
|
||||||
|
"""
|
||||||
|
token_offsets = tags_to_entities(tags)
|
||||||
|
spans = []
|
||||||
|
for label, start_idx, end_idx in token_offsets:
|
||||||
|
span = Span(doc, start_idx, end_idx + 1, label=label)
|
||||||
|
spans.append(span)
|
||||||
|
return spans
|
||||||
|
|
||||||
|
|
||||||
def offsets_from_biluo_tags(doc, tags):
|
def offsets_from_biluo_tags(doc, tags):
|
||||||
"""Encode per-token tags following the BILUO scheme into entity offsets.
|
"""Encode per-token tags following the BILUO scheme into entity offsets.
|
||||||
|
|
||||||
|
@ -670,12 +688,8 @@ def offsets_from_biluo_tags(doc, tags):
|
||||||
`end` will be character-offset integers denoting the slice into the
|
`end` will be character-offset integers denoting the slice into the
|
||||||
original string.
|
original string.
|
||||||
"""
|
"""
|
||||||
token_offsets = tags_to_entities(tags)
|
spans = spans_from_biluo_tags(doc, tags)
|
||||||
offsets = []
|
return [(span.start_char, span.end_char, span.label_) for span in spans]
|
||||||
for label, start_idx, end_idx in token_offsets:
|
|
||||||
span = doc[start_idx : end_idx + 1]
|
|
||||||
offsets.append((span.start_char, span.end_char, label))
|
|
||||||
return offsets
|
|
||||||
|
|
||||||
|
|
||||||
def is_punct_label(label):
|
def is_punct_label(label):
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||||
|
from spacy.gold import spans_from_biluo_tags
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,3 +51,14 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
|
||||||
assert biluo_tags_converted == biluo_tags
|
assert biluo_tags_converted == biluo_tags
|
||||||
offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
|
offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
|
||||||
assert offsets_converted == offsets
|
assert offsets_converted == offsets
|
||||||
|
|
||||||
|
|
||||||
|
def test_biluo_spans(en_tokenizer):
|
||||||
|
doc = en_tokenizer("I flew to Silicon Valley via London.")
|
||||||
|
biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
||||||
|
spans = spans_from_biluo_tags(doc, biluo_tags)
|
||||||
|
assert len(spans) == 2
|
||||||
|
assert spans[0].text == "Silicon Valley"
|
||||||
|
assert spans[0].label_ == "LOC"
|
||||||
|
assert spans[1].text == "London"
|
||||||
|
assert spans[1].label_ == "GPE"
|
||||||
|
|
Loading…
Reference in New Issue