2018-07-24 21:38:44 +00:00
|
|
|
# coding: utf-8
|
2017-07-21 23:48:58 +00:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
import pytest
|
2018-07-24 21:38:44 +00:00
|
|
|
from spacy.pipeline import EntityRecognizer
|
|
|
|
from spacy.vocab import Vocab
|
|
|
|
from spacy.syntax.ner import BiluoPushDown
|
|
|
|
from spacy.gold import GoldParse
|
|
|
|
from spacy.tokens import Doc
|
2017-07-21 23:48:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def vocab():
    """Provide a default-constructed ``Vocab`` for the tests in this module."""
    fresh_vocab = Vocab()
    return fresh_vocab
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
|
|
|
|
def doc(vocab):
|
2018-11-27 00:09:36 +00:00
|
|
|
return Doc(vocab, words=["Casey", "went", "to", "New", "York", "."])
|
2017-07-21 23:48:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def entity_annots(doc):
    """Return ``(start_char, end_char, label)`` annotations for the sample doc.

    Token 0 is labelled "PERSON" and tokens 3-4 are labelled "GPE"; the
    offsets are read from the corresponding spans of ``doc``.
    """
    labelled_spans = ((doc[0:1], "PERSON"), (doc[3:5], "GPE"))
    return [
        (span.start_char, span.end_char, label)
        for span, label in labelled_spans
    ]
|
2017-07-21 23:48:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def entity_types(entity_annots):
    """Return the distinct labels found in ``entity_annots``, sorted."""
    distinct_labels = {label for _start, _end, label in entity_annots}
    return sorted(distinct_labels)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def tsys(vocab, entity_types):
    """Create a ``BiluoPushDown`` transition system for ``entity_types``."""
    ner_actions = BiluoPushDown.get_actions(entity_types=entity_types)
    string_store = vocab.strings
    return BiluoPushDown(string_store, ner_actions)
|
|
|
|
|
|
|
|
|
|
|
|
def test_get_oracle_moves(tsys, doc, entity_annots):
    """The oracle sequence for the sample doc maps to the expected BILUO names."""
    gold = GoldParse(doc, entities=entity_annots)
    tsys.preprocess_gold(gold)
    oracle_actions = tsys.get_oracle_sequence(doc, gold)
    action_names = [tsys.get_class_name(action) for action in oracle_actions]
    expected_names = ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]
    assert action_names == expected_names
|
|
|
|
|
2017-07-21 23:48:58 +00:00
|
|
|
|
|
|
|
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
    """An oracle sequence is produced when every label is prefixed with "!"."""
    negated_annots = [
        (start, end, "!" + label) for start, end, label in entity_annots
    ]
    gold = GoldParse(doc, entities=negated_annots)
    # Overwrite the closing tag of the negated GPE entity with "-".
    for index, tag in enumerate(gold.ner):
        if tag != "L-!GPE":
            continue
        gold.ner[index] = "-"
    tsys.preprocess_gold(gold)
    oracle_actions = tsys.get_oracle_sequence(doc, gold)
    action_names = [tsys.get_class_name(action) for action in oracle_actions]
    # Only checks that at least one action name came back.
    assert action_names
|
2017-07-21 23:48:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_get_oracle_moves_negative_entities2(tsys, vocab):
    """An oracle sequence is produced for two adjacent "!PERSON" entities."""
    letters = ["A", "B", "C", "D"]
    doc = Doc(vocab, words=letters)
    gold = GoldParse(doc, entities=[])
    # Set the NER tags directly instead of deriving them from offsets.
    gold.ner = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"]
    tsys.preprocess_gold(gold)
    oracle_actions = tsys.get_oracle_sequence(doc, gold)
    action_names = [tsys.get_class_name(action) for action in oracle_actions]
    # Only checks that at least one action name came back.
    assert action_names
|
2017-07-21 23:48:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_get_oracle_moves_negative_O(tsys, vocab):
    """An oracle sequence is produced when "O" and "!O" tags alternate."""
    letters = ["A", "B", "C", "D"]
    doc = Doc(vocab, words=letters)
    gold = GoldParse(doc, entities=[])
    # Set the NER tags directly instead of deriving them from offsets.
    gold.ner = ["O", "!O", "O", "!O"]
    tsys.preprocess_gold(gold)
    oracle_actions = tsys.get_oracle_sequence(doc, gold)
    action_names = [tsys.get_class_name(action) for action in oracle_actions]
    # Only checks that at least one action name came back.
    assert action_names
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_doc_add_entities_set_ents_iob(en_vocab):
    """Assigning ``doc.ents`` updates the per-token ``ent_iob_`` values."""
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    ner = EntityRecognizer(en_vocab)
    ner.begin_training([])
    ner(doc)

    def iob_tags():
        # Read the current IOB string of every token in the doc.
        return [token.ent_iob_ for token in doc]

    # After running the recognizer the test expects no entities, all "O".
    assert list(doc.ents) == []
    assert iob_tags() == (["O"] * len(doc))

    animal_label = doc.vocab.strings["ANIMAL"]
    doc.ents = [(animal_label, 3, 4)]
    assert iob_tags() == ["", "", "", "B"]

    word_label = doc.vocab.strings["WORD"]
    doc.ents = [(word_label, 0, 2)]
    assert iob_tags() == ["B", "I", "", ""]
|
2019-08-29 12:33:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_oracle_moves_missing_B(en_vocab):
    """Smoke-test the oracle on an "L-" tag that has no preceding "B-" tag.

    The gold tags are ``[None, None, "L-PRODUCT"]``. The test has no
    assertion; it passes as long as ``preprocess_gold`` and
    ``get_oracle_sequence`` complete without raising.
    """
    words = ["B", "52", "Bomber"]
    biluo_tags = [None, None, "L-PRODUCT"]

    doc = Doc(en_vocab, words=words)
    gold = GoldParse(doc, words=words, entities=biluo_tags)

    moves = BiluoPushDown(en_vocab.strings)
    move_types = ("M", "B", "I", "L", "U", "O")
    for tag in biluo_tags:
        if tag is None:
            continue
        elif tag == "O":
            moves.add_action(move_types.index("O"), "")
        else:
            # Only the label is needed: all four entity moves are registered
            # for it regardless of which prefix the tag carries. Splitting on
            # the first hyphen keeps hyphenated labels intact.
            label = tag.split("-", 1)[1]
            for move_name in ("B", "I", "L", "U"):
                moves.add_action(move_types.index(move_name), label)
    moves.preprocess_gold(gold)
    moves.get_oracle_sequence(doc, gold)
|
|
|
|
|
|
|
|
|
|
|
|
def test_oracle_moves_whitespace(en_vocab):
    """Smoke-test the oracle on an entity interrupted by newline tokens.

    One newline token inside the ORG entity has a ``None`` tag. The test has
    no assertion; it passes as long as the calls complete without raising.
    """
    words = [
        "production",
        "\n",
        "of",
        "Northrop",
        "\n",
        "Corp.",
        "\n",
        "'s",
        "radar",
    ]
    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]

    doc = Doc(en_vocab, words=words)
    gold = GoldParse(doc, words=words, entities=biluo_tags)

    moves = BiluoPushDown(en_vocab.strings)
    move_types = ("M", "B", "I", "L", "U", "O")
    for tag in biluo_tags:
        if tag is None:
            continue
        if tag == "O":
            move_name, label = "O", ""
        else:
            move_name, label = tag.split("-")
        # Register exactly the move that this tag's prefix names.
        moves.add_action(move_types.index(move_name), label)
    moves.preprocess_gold(gold)
    moves.get_oracle_sequence(doc, gold)
|