diff --git a/spacy/tests/gold/test_lev_align.py b/spacy/tests/gold/test_lev_align.py new file mode 100644 index 000000000..29f58a156 --- /dev/null +++ b/spacy/tests/gold/test_lev_align.py @@ -0,0 +1,36 @@ +# coding: utf-8 +"""Find the min-cost alignment between two tokenizations""" + +from __future__ import unicode_literals + +from ...gold import _min_edit_path as min_edit_path +from ...gold import align + +import pytest + + +@pytest.mark.parametrize('cand,gold,path', [ + (["U.S", ".", "policy"], ["U.S.", "policy"], (0, 'MDM')), + (["U.N", ".", "policy"], ["U.S.", "policy"], (1, 'SDM')), + (["The", "cat", "sat", "down"], ["The", "cat", "sat", "down"], (0, 'MMMM')), + (["cat", "sat", "down"], ["The", "cat", "sat", "down"], (1, 'IMMM')), + (["The", "cat", "down"], ["The", "cat", "sat", "down"], (1, 'MMIM')), + (["The", "cat", "sag", "down"], ["The", "cat", "sat", "down"], (1, 'MMSM'))]) +def test_gold_lev_align_edit_path(cand, gold, path): + assert min_edit_path(cand, gold) == path + + +def test_gold_lev_align_edit_path2(): + cand = ["your", "stuff"] + gold = ["you", "r", "stuff"] + assert min_edit_path(cand, gold) in [(2, 'ISM'), (2, 'SIM')] + + +@pytest.mark.parametrize('cand,gold,result', [ + (["U.S", ".", "policy"], ["U.S.", "policy"], [0, None, 1]), + (["your", "stuff"], ["you", "r", "stuff"], [None, 2]), + (["i", "like", "2", "guys", " ", "well", "id", "just", "come", "straight", "out"], + ["i", "like", "2", "guys", "well", "i", "d", "just", "come", "straight", "out"], + [0, 1, 2, 3, None, 4, None, 7, 8, 9, 10])]) +def test_gold_lev_align(cand, gold, result): + assert align(cand, gold) == result diff --git a/spacy/tests/munge/test_lev_align.py b/spacy/tests/munge/test_lev_align.py deleted file mode 100644 index 2d34c2200..000000000 --- a/spacy/tests/munge/test_lev_align.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Find the min-cost alignment between two tokenizations""" -from spacy.gold import _min_edit_path as min_edit_path -from spacy.gold import align - - -def test_edit_path(): - cand = ["U.S", ".", "policy"] - gold = ["U.S.", "policy"] - assert min_edit_path(cand, gold) == (0, 'MDM') - cand = ["U.N", ".", "policy"] - gold = ["U.S.", "policy"] - assert min_edit_path(cand, gold) == (1, 'SDM') - cand = ["The", "cat", "sat", "down"] - gold = ["The", "cat", "sat", "down"] - assert min_edit_path(cand, gold) == (0, 'MMMM') - cand = ["cat", "sat", "down"] - gold = ["The", "cat", "sat", "down"] - assert min_edit_path(cand, gold) == (1, 'IMMM') - cand = ["The", "cat", "down"] - gold = ["The", "cat", "sat", "down"] - assert min_edit_path(cand, gold) == (1, 'MMIM') - cand = ["The", "cat", "sag", "down"] - gold = ["The", "cat", "sat", "down"] - assert min_edit_path(cand, gold) == (1, 'MMSM') - cand = ["your", "stuff"] - gold = ["you", "r", "stuff"] - assert min_edit_path(cand, gold) in [(2, 'ISM'), (2, 'SIM')] - - -def test_align(): - cand = ["U.S", ".", "policy"] - gold = ["U.S.", "policy"] - assert align(cand, gold) == [0, None, 1] - cand = ["your", "stuff"] - gold = ["you", "r", "stuff"] - assert align(cand, gold) == [None, 2] - cand = [u'i', u'like', u'2', u'guys', u' ', u'well', u'id', u'just', - u'come', u'straight', u'out'] - gold = [u'i', u'like', u'2', u'guys', u'well', u'i', u'd', u'just', u'come', - u'straight', u'out'] - assert align(cand, gold) == [0, 1, 2, 3, None, 4, None, 7, 8, 9, 10] -