diff --git a/tests/test_align.py b/tests/test_align.py
new file mode 100644
index 000000000..9d817e107
--- /dev/null
+++ b/tests/test_align.py
@@ -0,0 +1,36 @@
+from spacy.util import align_tokens
+
+
+def test_perfect_align():
+    """Back-to-back spans matching each token exactly align one-to-one."""
+    ref = ['I', 'align', 'perfectly']
+    # Offsets as if the tokens were concatenated with no separator.
+    indices = []
+    i = 0
+    for token in ref:
+        indices.append((i, i + len(token)))
+        i += len(token)
+    aligned = list(align_tokens(ref, indices))
+    assert aligned[0] == ('I', [(0, 1)])
+    assert aligned[1] == ('align', [(1, 6)])
+    assert aligned[2] == ('perfectly', [(6, 15)])
+
+
+def test_hyphen_align():
+    """A token spanning several index ranges is paired with all of them."""
+    ref = ['I', 'must', 're-align']
+    indices = [(0, 1), (1, 5), (5, 7), (7, 8), (8, 13)]
+    aligned = list(align_tokens(ref, indices))
+    assert aligned[0] == ('I', [(0, 1)])
+    assert aligned[1] == ('must', [(1, 5)])
+    assert aligned[2] == ('re-align', [(5, 7), (7, 8), (8, 13)])
+
+
+def test_align_continue():
+    """Tokens after a multi-range token still align one-to-one."""
+    ref = ['I', 'must', 're-align', 'and', 'continue']
+    indices = [(0, 1), (1, 5), (5, 7), (7, 8), (8, 13), (13, 16), (16, 24)]
+    aligned = list(align_tokens(ref, indices))
+    assert aligned[2] == ('re-align', [(5, 7), (7, 8), (8, 13)])
+    assert aligned[3] == ('and', [(13, 16)])
+    assert aligned[4] == ('continue', [(16, 24)])
diff --git a/tests/test_tokens_from_list.py b/tests/test_tokens_from_list.py
new file mode 100644
index 000000000..eef00f403
--- /dev/null
+++ b/tests/test_tokens_from_list.py
@@ -0,0 +1,11 @@
+from __future__ import unicode_literals
+
+from spacy.en import EN
+
+
+def test1():
+    """Three words give three tokens; the first keeps its string."""
+    words = ['JAPAN', 'GET', 'LUCKY']
+    tokens = EN.tokens_from_list(words)
+    assert len(tokens) == 3
+    assert tokens[0].string == 'JAPAN'