# Provenance (recovered from the git patch header this module was shipped in):
#   commit  13a8595a4b01d248a4d5659ec728b6fcc0fdcc4a
#   From:   Matthew Honnibal
#   Date:   Sun, 24 May 2015 21:45:57 +0200
#   Subject: [PATCH] * Add tests for Levenshtein alignment of training data
#   File:   tests/test_lev_align.py (new file, index 000000000..2d34c2200,
#           1 file changed, 42 insertions(+))
"""Find the min-cost alignment between two tokenizations"""
from spacy.gold import _min_edit_path as min_edit_path
from spacy.gold import align


def test_edit_path():
    """Check the ``(cost, ops)`` pair produced for several tokenization pairs.

    Every case holds the candidate tokens, the gold tokens and the list of
    results accepted for that pair.  The ops string uses the letters 'M',
    'D', 'I' and 'S' (presumably match / delete / insert / substitute --
    confirm against ``spacy.gold``).
    """
    cases = [
        (["U.S", ".", "policy"], ["U.S.", "policy"], [(0, 'MDM')]),
        (["U.N", ".", "policy"], ["U.S.", "policy"], [(1, 'SDM')]),
        (
            ["The", "cat", "sat", "down"],
            ["The", "cat", "sat", "down"],
            [(0, 'MMMM')],
        ),
        (
            ["cat", "sat", "down"],
            ["The", "cat", "sat", "down"],
            [(1, 'IMMM')],
        ),
        (
            ["The", "cat", "down"],
            ["The", "cat", "sat", "down"],
            [(1, 'MMIM')],
        ),
        (
            ["The", "cat", "sag", "down"],
            ["The", "cat", "sat", "down"],
            [(1, 'MMSM')],
        ),
        # Two equally cheap paths exist here, so either one is accepted.
        (
            ["your", "stuff"],
            ["you", "r", "stuff"],
            [(2, 'ISM'), (2, 'SIM')],
        ),
    ]
    for candidate, reference, accepted in cases:
        assert min_edit_path(candidate, reference) in accepted


def test_align():
    """Check the per-candidate-token alignment list returned by ``align``.

    Each expected list has one entry per candidate token: an integer
    (apparently the index of the corresponding gold token -- confirm against
    ``spacy.gold``) or ``None`` for a candidate token with no counterpart.
    """
    scenarios = (
        (["U.S", ".", "policy"], ["U.S.", "policy"], [0, None, 1]),
        (["your", "stuff"], ["you", "r", "stuff"], [None, 2]),
        (
            [u'i', u'like', u'2', u'guys', u' ', u'well', u'id', u'just',
             u'come', u'straight', u'out'],
            [u'i', u'like', u'2', u'guys', u'well', u'i', u'd', u'just',
             u'come', u'straight', u'out'],
            [0, 1, 2, 3, None, 4, None, 7, 8, 9, 10],
        ),
    )
    for candidate, reference, expected in scenarios:
        assert align(candidate, reference) == expected