Added tokenizer tests

luvogels 2017-04-26 19:10:18 +02:00
parent cbfe4920bb
commit 8de59ce3b9
3 changed files with 18 additions and 39 deletions

@@ -4,7 +4,6 @@ from __future__ import unicode_literals
from . import util
from .deprecated import resolve_model_name
from .cli.info import info
<<<<<<< HEAD
from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
@@ -18,39 +17,6 @@ _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
for _lang in _languages:
    util.set_lang_class(_lang.lang, _lang)
from . import en
from . import de
from . import zh
from . import es
from . import it
from . import hu
from . import fr
from . import pt
from . import nl
from . import sv
from . import fi
from . import bn
from . import he
from . import nb
from .about import *
set_lang_class(en.English.lang, en.English)
set_lang_class(de.German.lang, de.German)
set_lang_class(es.Spanish.lang, es.Spanish)
set_lang_class(pt.Portuguese.lang, pt.Portuguese)
set_lang_class(fr.French.lang, fr.French)
set_lang_class(it.Italian.lang, it.Italian)
set_lang_class(hu.Hungarian.lang, hu.Hungarian)
set_lang_class(zh.Chinese.lang, zh.Chinese)
set_lang_class(nl.Dutch.lang, nl.Dutch)
set_lang_class(sv.Swedish.lang, sv.Swedish)
set_lang_class(fi.Finnish.lang, fi.Finnish)
set_lang_class(bn.Bengali.lang, bn.Bengali)
set_lang_class(he.Hebrew.lang, he.Hebrew)
set_lang_class(nb.Norwegian.lang, nb.Norwegian)
def load(name, **overrides):
    if overrides.get('path') in (None, False, True):
@@ -72,8 +38,4 @@ def load(name, **overrides):
    cls = util.get_lang_class(lang)
    overrides['meta'] = meta
    overrides['path'] = model_path
    return cls(**overrides)
def info(name, markdown):
    info(name, markdown)
    return cls(**overrides)
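
The `__init__.py` hunks above resolve a leftover merge conflict: the tuple-based registration loop is kept, while the duplicated per-language imports and individual `set_lang_class` calls are deleted. For orientation, here is a minimal standalone sketch of that registry pattern; the `set_lang_class`/`get_lang_class` names mirror `spacy.util`, but this is an illustration, not spaCy's actual implementation:

```python
# Minimal sketch of a language-class registry, assuming the same
# set/get pattern as spacy.util; not spaCy's actual code.
LANGUAGES = {}

def set_lang_class(name, cls):
    # Map an ISO code such as 'nb' to its Language subclass.
    LANGUAGES[name] = cls

def get_lang_class(name):
    if name not in LANGUAGES:
        raise RuntimeError('Language not supported: %s' % name)
    return LANGUAGES[name]

class Norwegian(object):
    lang = 'nb'

# Registering via a loop over a tuple, as in the diff above:
for _lang in (Norwegian,):
    set_lang_class(_lang.lang, _lang)

assert get_lang_class('nb') is Norwegian
```

Registering every language through one loop over `_languages` means adding a language (such as `nb` in this branch) only touches the import line and the tuple, instead of a separate `set_lang_class` call per language.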

@@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals

import pytest


NB_TOKEN_EXCEPTION_TESTS = [
    ('Smørsausen brukes bl.a. til fisk', ['Smørsausen', 'brukes', 'bl.a.', 'til', 'fisk']),
    ('Jeg kommer først kl. 13 pga. diverse forsinkelser', ['Jeg', 'kommer', 'først', 'kl.', '13', 'pga.', 'diverse', 'forsinkelser'])
]


@pytest.mark.parametrize('text,expected_tokens', NB_TOKEN_EXCEPTION_TESTS)
def test_tokenizer_handles_exception_cases(nb_tokenizer, text, expected_tokens):
    tokens = nb_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
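
The test depends on an `nb_tokenizer` pytest fixture, presumably defined in the suite's `conftest.py` (plausibly the third changed file, whose diff is not shown above). Following the pattern of spaCy's existing tokenizer fixtures such as `en_tokenizer`, it would look roughly like this; the exact contents for this commit are an assumption:

```python
# Sketch of the fixture the test above relies on, modeled on spaCy's
# existing per-language tokenizer fixtures; not confirmed against the
# actual conftest.py in this commit.
import pytest
from spacy import util


@pytest.fixture
def nb_tokenizer():
    # Build a bare tokenizer from the Norwegian defaults, without
    # loading any statistical models.
    return util.get_lang_class('nb').Defaults.create_tokenizer()
```

Testing against the tokenizer built from `Defaults` keeps the test fast and model-free: it exercises only the `nb` tokenizer exceptions (abbreviations like `bl.a.`, `kl.`, and `pga.`), which is exactly what this commit adds coverage for.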