From 3c1c0ec18ec702afbe03e24519cdb0c3a513c945 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 29 Aug 2019 14:33:39 +0200
Subject: [PATCH] Add tests for NER oracle with whitespace

---
 Review notes (commentary only; placed in the free-text area after the
 "---" separator, next to the diffstat, so the hunk below is untouched):

 * Purpose: the hunk appends two tests to spacy/tests/parser/test_ner.py,
   test_oracle_moves_missing_B and test_oracle_moves_whitespace.

 * Shared shape: each test builds a Doc and a GoldParse from a word list
   and a list of BILUO tags, registers actions on a BiluoPushDown via
   add_action(move_types.index(...), label), then calls
   preprocess_gold(gold) and get_oracle_sequence(doc, gold).

 * test_oracle_moves_missing_B: the gold tags are
   [None, None, "L-PRODUCT"], i.e. an "L-" tag with no preceding "B-" tag.
   For any tag other than None or "O" the loop registers the B, I, L and U
   actions for the label, regardless of the tag's own prefix; the "action"
   value unpacked from tag.split("-") is not used.

 * test_oracle_moves_whitespace: the word list contains three "\n" tokens.
   The "\n" between "Northrop" (B-ORG) and "Corp." (I-ORG) is tagged None,
   and the "\n" after "Corp." is tagged "L-ORG". Here only the action named
   by each tag's own prefix is registered.

 * Neither test contains an assert statement, so the only thing checked is
   that the calls complete without raising.
   NOTE(review): print(seq) in the first test looks like leftover debug
   output, and seq in the second test is assigned but never read; consider
   asserting on the returned sequence instead. TODO confirm intent.

 * Doc, GoldParse, BiluoPushDown and the en_vocab argument are not defined
   in this hunk; presumably they are imported / provided as fixtures
   elsewhere in the test suite -- verify against the target file.

 spacy/tests/parser/test_ner.py | 66 ++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 43c00a963..c39491ecf 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -91,3 +91,69 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
     doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
     assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
+
+
+def test_oracle_moves_missing_B(en_vocab):
+    words = ["B", "52", "Bomber"]
+    biluo_tags = [None, None, "L-PRODUCT"]
+
+    doc = Doc(en_vocab, words=words)
+    gold = GoldParse(doc, words=words, entities=biluo_tags)
+
+    moves = BiluoPushDown(en_vocab.strings)
+    move_types = ("M", "B", "I", "L", "U", "O")
+    for tag in biluo_tags:
+        if tag is None:
+            continue
+        elif tag == "O":
+            moves.add_action(move_types.index("O"), "")
+        else:
+            action, label = tag.split("-")
+            moves.add_action(move_types.index("B"), label)
+            moves.add_action(move_types.index("I"), label)
+            moves.add_action(move_types.index("L"), label)
+            moves.add_action(move_types.index("U"), label)
+    moves.preprocess_gold(gold)
+    seq = moves.get_oracle_sequence(doc, gold)
+    print(seq)
+
+
+def test_oracle_moves_whitespace(en_vocab):
+    words = [
+        "production",
+        "\n",
+        "of",
+        "Northrop",
+        "\n",
+        "Corp.",
+        "\n",
+        "'s",
+        "radar",
+    ]
+    biluo_tags = [
+        "O",
+        "O",
+        "O",
+        "B-ORG",
+        None,
+        "I-ORG",
+        "L-ORG",
+        "O",
+        "O",
+    ]
+
+    doc = Doc(en_vocab, words=words)
+    gold = GoldParse(doc, words=words, entities=biluo_tags)
+
+    moves = BiluoPushDown(en_vocab.strings)
+    move_types = ("M", "B", "I", "L", "U", "O")
+    for tag in biluo_tags:
+        if tag is None:
+            continue
+        elif tag == "O":
+            moves.add_action(move_types.index("O"), "")
+        else:
+            action, label = tag.split("-")
+            moves.add_action(move_types.index(action), label)
+    moves.preprocess_gold(gold)
+    seq = moves.get_oracle_sequence(doc, gold)