mirror of https://github.com/explosion/spaCy.git
feature(model): Add support for creating the Spanish model, including a rich tagset, configuration, and basic tests
This commit is contained in:
parent
010293fb2f
commit
c693d40791
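Note: as a minimal usage sketch of what this commit enables, using only the API visible in the diff below (Spanish.Defaults.create_tokenizer; the spacy.es module path is assumed from the file layout shown here, and the sample sentence is made up):

# Minimal sketch, assuming the spaCy v1-era layout this diff targets:
# build a standalone Spanish tokenizer from the new language defaults and
# check that the added abbreviation exceptions survive as single tokens.
from spacy.es import Spanish

tokenizer = Spanish.Defaults.create_tokenizer()
doc = tokenizer("Nos vemos en la esq. de siempre, aprox. a las 8.")
print([t.text for t in doc])  # "esq." and "aprox." should remain single tokens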
@@ -17,4 +17,5 @@ class Spanish(Language):
         lex_attr_getters[LANG] = lambda text: 'es'
 
         tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        tag_map = TAG_MAP
         stop_words = STOP_WORDS
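For readers unfamiliar with the exceptions format wired in above: a hypothetical sketch of the kind of entries spacy/es/tokenizer_exceptions.py (not shown in this excerpt) contains. Each surface form maps to the token attributes it should produce; the two entries and their lemmas are taken from the tests further down, not from the real file.

# Illustrative sketch only; the real entries live in spacy/es/tokenizer_exceptions.py.
from spacy.symbols import ORTH, LEMMA

EXAMPLE_EXCEPTIONS = {
    "aprox.": [{ORTH: "aprox.", LEMMA: "aproximadamente"}],
    "p.ej.": [{ORTH: "p.ej.", LEMMA: "por ejemplo"}],
}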
@@ -5,6 +5,7 @@ from .. import language_data as base
 from ..language_data import update_exc, strings_to_exc
 from ..symbols import ORTH, LEMMA
 
+from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 
@@ -39,7 +40,7 @@ def get_time_exc(hours):
     ]
     return exc
 
-
+TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
 
 
@@ -51,4 +52,4 @@ update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
+__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
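The two helpers used above come from spacy/language_data: strings_to_exc turns a flat list of strings (such as base.EMOTICONS or base.ABBREVIATIONS) into exception entries, and update_exc merges them into the shared dict. A simplified sketch of that behaviour follows; the _sketch suffix marks these as stand-ins, not the real implementations.

from spacy.symbols import ORTH

def strings_to_exc_sketch(strings):
    # one single-token exception per string, e.g. ":)" -> [{ORTH: ":)"}]
    return {s: [{ORTH: s}] for s in strings}

def update_exc_sketch(exc, additions):
    # merge the new entries into TOKENIZER_EXCEPTIONS in place
    exc.update(additions)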
spacy/es/tag_map.py (1043 lines)
File diff suppressed because it is too large.
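The suppressed file carries the rich Spanish tagset announced in the commit message. spaCy tag maps associate each fine-grained tag string with a coarse part of speech plus morphological features; the entry below is purely illustrative, since the actual keys are not visible on this page.

from spacy.symbols import POS, NOUN

EXAMPLE_TAG_MAP_ENTRY = {
    "NOUN__Gender=Masc|Number=Sing": {POS: NOUN, "Gender": "Masc", "Number": "Sing"},
}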
@@ -49,6 +49,10 @@ def en_vocab():
 def en_parser():
     return English.Defaults.create_parser()
 
+@pytest.fixture
+def es_tokenizer():
+    return Spanish.Defaults.create_tokenizer()
+
 
 @pytest.fixture
 def de_tokenizer():
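pytest injects the new es_tokenizer fixture by argument name, so any test module in the suite can request a ready-made Spanish tokenizer without constructing one itself. A hypothetical smoke test (not part of the commit) showing the pattern:

def test_es_tokenizer_smoke(es_tokenizer):
    # the fixture returns a callable tokenizer that produces a Doc-like token sequence
    tokens = es_tokenizer("Hola mundo")
    assert len(tokens) == 2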
@@ -0,0 +1,24 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text,lemma', [("aprox.", "aproximadamente"),
+                                        ("esq.", "esquina"),
+                                        ("pág.", "página"),
+                                        ("p.ej.", "por ejemplo")
+                                        ])
+def test_tokenizer_handles_abbr(es_tokenizer, text, lemma):
+    tokens = es_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].lemma_ == lemma
+
+
+def test_tokenizer_handles_exc_in_text(es_tokenizer):
+    text = "Mariano Rajoy ha corrido aprox. medio kilómetro"
+    tokens = es_tokenizer(text)
+    assert len(tokens) == 7
+    assert tokens[4].text == "aprox."
+    assert tokens[4].lemma_ == "aproximadamente"
@@ -0,0 +1,35 @@
+# coding: utf-8
+
+"""Test that longer and mixed texts are tokenized correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_tokenizer_handles_long_text(es_tokenizer):
+    text = """Cuando a José Mujica lo invitaron a dar una conferencia
+
+en Oxford este verano, su cabeza hizo "crac". La "más antigua" universidad de habla
+
+inglesa, esa que cobra decenas de miles de euros de matrícula a sus alumnos
+
+y en cuyos salones han disertado desde Margaret Thatcher hasta Stephen Hawking,
+
+reclamaba los servicios de este viejo de 81 años, formado en un colegio público
+
+en Montevideo y que pregona las bondades de la vida austera."""
+    tokens = es_tokenizer(text)
+    assert len(tokens) == 90
+
+
+@pytest.mark.parametrize('text,length', [
+    ("¿Por qué José Mujica?", 6),
+    ("“¿Oh no?”", 6),
+    ("""¡Sí! "Vámonos", contestó José Arcadio Buendía""", 11),
+    ("Corrieron aprox. 10km.", 5),
+    ("Y entonces por qué...", 5)])
+def test_tokenizer_handles_cnts(es_tokenizer, text, length):
+    tokens = es_tokenizer(text)
+    assert len(tokens) == length