mirror of https://github.com/explosion/spaCy.git
NER align tests (#5656)
* one_to_man works better. misalignment doesn't yet. * fix tests * restore example * xfail alignment tests
This commit is contained in:
parent
2d9604d39c
commit
fc3cb1fa9e
|
@ -47,7 +47,7 @@ cdef class Example:
|
||||||
|
|
||||||
def __set__(self, doc):
|
def __set__(self, doc):
|
||||||
self.x = doc
|
self.x = doc
|
||||||
|
|
||||||
property reference:
|
property reference:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.y
|
return self.y
|
||||||
|
@ -60,7 +60,7 @@ cdef class Example:
|
||||||
self.x.copy(),
|
self.x.copy(),
|
||||||
self.y.copy()
|
self.y.copy()
|
||||||
)
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_dict(cls, Doc predicted, dict example_dict):
|
def from_dict(cls, Doc predicted, dict example_dict):
|
||||||
if example_dict is None:
|
if example_dict is None:
|
||||||
|
@ -78,7 +78,7 @@ cdef class Example:
|
||||||
predicted,
|
predicted,
|
||||||
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def alignment(self):
|
def alignment(self):
|
||||||
if self._alignment is None:
|
if self._alignment is None:
|
||||||
|
@ -151,7 +151,7 @@ cdef class Example:
|
||||||
x_text = self.x.text[end_char:]
|
x_text = self.x.text[end_char:]
|
||||||
x_text_offset = end_char
|
x_text_offset = end_char
|
||||||
x_tags = biluo_tags_from_offsets(
|
x_tags = biluo_tags_from_offsets(
|
||||||
self.x,
|
self.x,
|
||||||
[(e.start_char, e.end_char, e.label_) for e in x_spans],
|
[(e.start_char, e.end_char, e.label_) for e in x_spans],
|
||||||
missing=None
|
missing=None
|
||||||
)
|
)
|
||||||
|
|
|
@ -230,14 +230,13 @@ def test_json2docs_no_ner(en_vocab):
|
||||||
Doc(
|
Doc(
|
||||||
doc.vocab,
|
doc.vocab,
|
||||||
words=[w.text for w in doc],
|
words=[w.text for w in doc],
|
||||||
spaces=[bool(w.whitespace_) for w in doc]
|
spaces=[bool(w.whitespace_) for w in doc],
|
||||||
),
|
),
|
||||||
doc
|
doc,
|
||||||
)
|
)
|
||||||
ner_tags = eg.get_aligned_ner()
|
ner_tags = eg.get_aligned_ner()
|
||||||
assert ner_tags == [None, None, None, None, None]
|
assert ner_tags == [None, None, None, None, None]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_split_sentences(en_vocab):
|
def test_split_sentences(en_vocab):
|
||||||
words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
|
words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
|
||||||
|
@ -283,8 +282,8 @@ def test_split_sentences(en_vocab):
|
||||||
assert split_examples[1].text == "had loads of fun "
|
assert split_examples[1].text == "had loads of fun "
|
||||||
|
|
||||||
|
|
||||||
def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
|
@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
|
||||||
# one-to-many
|
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
|
||||||
words = ["I", "flew to", "San Francisco Valley", "."]
|
words = ["I", "flew to", "San Francisco Valley", "."]
|
||||||
spaces = [True, True, False, False]
|
spaces = [True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
|
@ -292,9 +291,28 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
|
||||||
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
|
assert ner_tags == ["O", "O", "U-LOC", "O"]
|
||||||
|
|
||||||
|
entities = [
|
||||||
|
(len("I "), len("I flew to"), "ORG"),
|
||||||
|
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
||||||
|
]
|
||||||
|
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
|
ner_tags = example.get_aligned_ner()
|
||||||
|
assert ner_tags == ["O", "U-ORG", "U-LOC", "O"]
|
||||||
|
|
||||||
|
entities = [
|
||||||
|
(len("I "), len("I flew"), "ORG"),
|
||||||
|
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
||||||
|
]
|
||||||
|
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", None, "U-LOC", "O"]
|
assert ner_tags == ["O", None, "U-LOC", "O"]
|
||||||
|
|
||||||
# many-to-one
|
|
||||||
|
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
|
||||||
words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
|
||||||
spaces = [True, True, True, True, True, False, False]
|
spaces = [True, True, True, True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
|
@ -304,31 +322,38 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
# misaligned
|
entities = [
|
||||||
|
(len("I "), len("I flew to"), "ORG"),
|
||||||
|
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
||||||
|
]
|
||||||
|
gold_words = ["I", "flew to", "San Francisco Valley", "."]
|
||||||
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
|
ner_tags = example.get_aligned_ner()
|
||||||
|
assert ner_tags == ["O", "B-ORG", "L-ORG", "B-LOC", "I-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
|
||||||
|
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
|
||||||
words = ["I flew", "to", "San Francisco", "Valley", "."]
|
words = ["I flew", "to", "San Francisco", "Valley", "."]
|
||||||
spaces = [True, True, True, False, False]
|
spaces = [True, True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
offset_start = len("I flew to ")
|
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
||||||
offset_end = len("I flew to San Francisco Valley")
|
|
||||||
entities = [(offset_start, offset_end, "LOC")]
|
|
||||||
links = {(offset_start, offset_end): {"Q816843": 1.0}}
|
|
||||||
gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
|
gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
|
||||||
example = Example.from_dict(
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
doc, {"words": gold_words, "entities": entities, "links": links}
|
|
||||||
)
|
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == [None, "O", "B-LOC", "L-LOC", "O"]
|
assert ner_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
|
||||||
#assert example.get_aligned("ENT_KB_ID", as_string=True) == [
|
|
||||||
# "",
|
|
||||||
# "",
|
|
||||||
# "Q816843",
|
|
||||||
# "Q816843",
|
|
||||||
# "",
|
|
||||||
#]
|
|
||||||
#assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {
|
|
||||||
# "Q816843": 1.0
|
|
||||||
#}
|
|
||||||
|
|
||||||
|
entities = [
|
||||||
|
(len("I "), len("I flew to"), "ORG"),
|
||||||
|
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
|
||||||
|
]
|
||||||
|
gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
|
||||||
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
|
ner_tags = example.get_aligned_ner()
|
||||||
|
assert ner_tags == [None, None, "B-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
|
||||||
# additional whitespace tokens in GoldParse words
|
# additional whitespace tokens in GoldParse words
|
||||||
words, spaces = get_words_and_spaces(
|
words, spaces = get_words_and_spaces(
|
||||||
["I", "flew", "to", "San Francisco", "Valley", "."],
|
["I", "flew", "to", "San Francisco", "Valley", "."],
|
||||||
|
@ -344,7 +369,8 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
|
||||||
ner_tags = example.get_aligned_ner()
|
ner_tags = example.get_aligned_ner()
|
||||||
assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
|
assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
|
||||||
|
|
||||||
# from issue #4791
|
|
||||||
|
def test_gold_biluo_4791(en_vocab, en_tokenizer):
|
||||||
doc = en_tokenizer("I'll return the ₹54 amount")
|
doc = en_tokenizer("I'll return the ₹54 amount")
|
||||||
gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
|
gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
|
||||||
gold_spaces = [False, True, True, True, False, True, False]
|
gold_spaces = [False, True, True, True, False, True, False]
|
||||||
|
@ -593,7 +619,6 @@ def test_tuple_format_implicit_invalid():
|
||||||
_train(train_data)
|
_train(train_data)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _train(train_data):
|
def _train(train_data):
|
||||||
nlp = English()
|
nlp = English()
|
||||||
ner = nlp.create_pipe("ner")
|
ner = nlp.create_pipe("ner")
|
||||||
|
|
Loading…
Reference in New Issue