mirror of https://github.com/explosion/spaCy.git
Added tokenizer tests
This commit is contained in:
parent
cbfe4920bb
commit
8de59ce3b9
|
@ -4,7 +4,6 @@ from __future__ import unicode_literals
|
|||
from . import util
|
||||
from .deprecated import resolve_model_name
|
||||
from .cli.info import info
|
||||
<<<<<<< HEAD
|
||||
|
||||
|
||||
from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
|
||||
|
@ -18,39 +17,6 @@ _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
|
|||
for _lang in _languages:
|
||||
util.set_lang_class(_lang.lang, _lang)
|
||||
|
||||
from . import en
|
||||
from . import de
|
||||
from . import zh
|
||||
from . import es
|
||||
from . import it
|
||||
from . import hu
|
||||
from . import fr
|
||||
from . import pt
|
||||
from . import nl
|
||||
from . import sv
|
||||
from . import fi
|
||||
from . import bn
|
||||
from . import he
|
||||
from . import nb
|
||||
|
||||
from .about import *
|
||||
|
||||
|
||||
set_lang_class(en.English.lang, en.English)
|
||||
set_lang_class(de.German.lang, de.German)
|
||||
set_lang_class(es.Spanish.lang, es.Spanish)
|
||||
set_lang_class(pt.Portuguese.lang, pt.Portuguese)
|
||||
set_lang_class(fr.French.lang, fr.French)
|
||||
set_lang_class(it.Italian.lang, it.Italian)
|
||||
set_lang_class(hu.Hungarian.lang, hu.Hungarian)
|
||||
set_lang_class(zh.Chinese.lang, zh.Chinese)
|
||||
set_lang_class(nl.Dutch.lang, nl.Dutch)
|
||||
set_lang_class(sv.Swedish.lang, sv.Swedish)
|
||||
set_lang_class(fi.Finnish.lang, fi.Finnish)
|
||||
set_lang_class(bn.Bengali.lang, bn.Bengali)
|
||||
set_lang_class(he.Hebrew.lang, he.Hebrew)
|
||||
set_lang_class(nb.Norwegian.lang, nb.Norwegian)
|
||||
|
||||
|
||||
def load(name, **overrides):
|
||||
if overrides.get('path') in (None, False, True):
|
||||
|
@ -72,8 +38,4 @@ def load(name, **overrides):
|
|||
cls = util.get_lang_class(lang)
|
||||
overrides['meta'] = meta
|
||||
overrides['path'] = model_path
|
||||
return cls(**overrides)
|
||||
|
||||
|
||||
def info(name, markdown):
    """Print meta information about the installed model *name*.

    name (unicode): Name or shortcut of the model to inspect.
    markdown (bool): Whether to format the output as Markdown.

    NOTE(review): the original body was `info(name, markdown)` — the function
    called itself, shadowing the module-level `from .cli.info import info`
    and recursing until RecursionError. Delegate explicitly to the CLI
    implementation instead; the local import avoids the shadowed name.
    """
    from .cli.info import info as _cli_info
    _cli_info(name, markdown)
|
||||
return cls(**overrides)
|
|
@ -0,0 +1,17 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Norwegian Bokmål tokenizer-exception cases: each entry pairs an input
# sentence with the exact token texts the tokenizer should produce
# (abbreviations like "bl.a.", "kl." and "pga." must stay single tokens).
NB_TOKEN_EXCEPTION_TESTS = [
    (
        'Smørsausen brukes bl.a. til fisk',
        ['Smørsausen', 'brukes', 'bl.a.', 'til', 'fisk'],
    ),
    (
        'Jeg kommer først kl. 13 pga. diverse forsinkelser',
        ['Jeg', 'kommer', 'først', 'kl.', '13', 'pga.', 'diverse',
         'forsinkelser'],
    ),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', NB_TOKEN_EXCEPTION_TESTS)
def test_tokenizer_handles_exception_cases(nb_tokenizer, text, expected_tokens):
    # Tokenize the sentence and compare against the expected token texts,
    # ignoring any whitespace tokens the tokenizer may emit.
    doc = nb_tokenizer(text)
    observed = []
    for token in doc:
        if not token.is_space:
            observed.append(token.text)
    assert expected_tokens == observed
|
Loading…
Reference in New Issue