mirror of https://github.com/Yomguithereal/fog.git
59 lines
1.7 KiB
Python
59 lines
1.7 KiB
Python
|
# =============================================================================
# Fog Fingerprint Tokenizer Unit Tests
# =============================================================================
|
||
|
from fog.tokenizers import (
|
||
|
create_fingerprint_tokenizer,
|
||
|
fingerprint_tokenizer,
|
||
|
ngrams_fingerprint_tokenizer
|
||
|
)
|
||
|
|
||
|
# Fixtures for the default fingerprint tokenizer, as (input string, expected
# fingerprint) pairs. The expected fingerprint is the tokenizer's output
# tokens joined with a single space.
TESTS = [
    ('', ''),
    ('hello', 'hello'),
    ('Tom Cruise', 'cruise tom'),
    ('The mouse is a mouse', 'a is mouse the'),
    ('électricité', 'electricite'),
    ('\x00Hello', 'hello'),
    ('Hello?', 'hello'),
]
|
||
|
|
||
|
# Fixtures for the n-grams fingerprint tokenizer, as (n, input string,
# expected fingerprint) triples. The expected fingerprint is the tokenizer's
# output joined with no separator.
NGRAMS_TESTS = [
    (2, '', ''),
    (2, 'Paris', 'arispari'),
    (1, 'Paris', 'aiprs'),
    (2, 'bébé', 'beeb'),
    (3, 'PariS', 'ariparris'),
]
|
||
|
|
||
|
|
||
|
class TestFingerprintTokenizer(object):
    """Unit tests for the fingerprint tokenizers imported from `fog.tokenizers`."""

    def test_basics(self):
        """Check the default tokenizer against every pair in `TESTS`."""
        for string, fingerprint in TESTS:
            # Tokens are joined with a space to match the fixture format.
            assert ' '.join(fingerprint_tokenizer(string)) == fingerprint

    def test_stopwords(self):
        """A tokenizer built with `stopwords=['de']` omits 'de' from its output."""
        tokenizer = create_fingerprint_tokenizer(stopwords=['de'])

        assert tokenizer('Université de Paris') == ['paris', 'universite']

    def test_digits(self):
        """A tokenizer built with `keep_digits=False` omits the '20' token."""
        tokenizer = create_fingerprint_tokenizer(keep_digits=False)

        assert tokenizer('20 grammes de maïß') == ['de', 'grammes', 'maiss']

    def test_min_token_size(self):
        """A tokenizer built with `min_token_size=2` omits the one-letter 'a'."""
        tokenizer = create_fingerprint_tokenizer(min_token_size=2)

        assert tokenizer('a very good cat') == ['cat', 'good', 'very']

    def test_split(self):
        """A tokenizer built with `split=[',', '-']` separates 'Bade-Wurt'."""
        tokenizer = create_fingerprint_tokenizer(min_token_size=2, split=[',', '-'])

        # NOTE(review): 'l' and 'de' are absent from the expected output,
        # presumably because of `min_token_size=2` -- confirm against the
        # tokenizer implementation.
        assert tokenizer('l\'université de Bade-Wurt') == ['bade', 'universite', 'wurt']

    def test_ngrams(self):
        """Check the n-grams tokenizer against every triple in `NGRAMS_TESTS`."""
        for n, string, fingerprint in NGRAMS_TESTS:
            # N-grams are joined without a separator to match the fixture format.
            assert ''.join(ngrams_fingerprint_tokenizer(n, string)) == fingerprint
|