fog/test/tokenizers/ngrams_test.py

# =============================================================================
# Fog Ngrams Unit Tests
# =============================================================================
import math
from fog.tokenizers import ngrams, bigrams, trigrams, quadrigrams

ALIASES = [None, bigrams, trigrams, quadrigrams]

STRING = 'Bonjour'
STRING_TESTS = [
    ('B', 'o', 'n', 'j', 'o', 'u', 'r'),
    ('Bo', 'on', 'nj', 'jo', 'ou', 'ur'),
    ('Bon', 'onj', 'njo', 'jou', 'our'),
    ('Bonj', 'onjo', 'njou', 'jour')
]

SENTENCE = tuple('the cat eats the mouse'.split(' '))
SENTENCE_TEST = [
    (('the',), ('cat',), ('eats',), ('the',), ('mouse',)),
    (('the', 'cat'), ('cat', 'eats'), ('eats', 'the'), ('the', 'mouse')),
    (('the', 'cat', 'eats'), ('cat', 'eats', 'the'), ('eats', 'the', 'mouse')),
    (('the', 'cat', 'eats', 'the'), ('cat', 'eats', 'the', 'mouse'))
]
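
# Note on the fixtures above: a string input is expected to yield ngrams
# joined back into strings ('Bo', 'on', ...), whereas a token sequence is
# expected to yield tuples of tokens (('the', 'cat'), ...).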


class TestNgrams(object):
    def test_basics(self):
        for i in range(4):
            # Generic ngrams(n, sequence) function, for n = 1..4
            assert tuple(ngrams(i + 1, STRING)) == STRING_TESTS[i]
            assert tuple(ngrams(i + 1, SENTENCE)) == SENTENCE_TEST[i]

            # Convenience aliases (there is no alias for unigrams)
            alias = ALIASES[i]

            if alias is not None:
                assert tuple(alias(STRING)) == STRING_TESTS[i]
                assert tuple(alias(SENTENCE)) == SENTENCE_TEST[i]
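

# =============================================================================
# Ad hoc runner (illustrative sketch, not part of the original suite)
# =============================================================================
# Assuming pytest is the runner for this file (the `Test*` class / `test_*`
# method naming follows its conventions), the file can also be executed
# directly as a quick sanity check before delegating to pytest.
if __name__ == '__main__':
    # Mirror two of the fixtures above.
    assert tuple(bigrams(STRING)) == ('Bo', 'on', 'nj', 'jo', 'ou', 'ur')
    assert tuple(trigrams(SENTENCE)) == (
        ('the', 'cat', 'eats'),
        ('cat', 'eats', 'the'),
        ('eats', 'the', 'mouse')
    )

    import pytest
    pytest.main([__file__])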