mirror of https://github.com/explosion/spaCy.git
feature(model): Add support for creating the Spanish model, including a rich tagset, configuration, and basic tests
This commit is contained in:
parent
010293fb2f
commit
c693d40791
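Note: as a minimal usage sketch of what this commit enables, using only the API visible in the diff below (Spanish.Defaults.create_tokenizer; the spacy.es module path is assumed from the file layout shown here, and the sample sentence is made up):

# Minimal sketch, assuming the spaCy v1-era layout this diff targets:
# build a standalone Spanish tokenizer from the new language defaults and
# check that the added abbreviation exceptions survive as single tokens.
from spacy.es import Spanish

tokenizer = Spanish.Defaults.create_tokenizer()
doc = tokenizer("Nos vemos en la esq. de siempre, aprox. a las 8.")
print([t.text for t in doc])  # "esq." and "aprox." should remain single tokens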
@@ -17,4 +17,5 @@ class Spanish(Language):
         lex_attr_getters[LANG] = lambda text: 'es'
 
         tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        tag_map = TAG_MAP
         stop_words = STOP_WORDS
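For readers unfamiliar with the exceptions format wired in above: a hypothetical sketch of the kind of entries spacy/es/tokenizer_exceptions.py (not shown in this excerpt) contains. Each surface form maps to the token attributes it should produce; the two entries and their lemmas are taken from the tests further down, not from the real file.

# Illustrative sketch only; the real entries live in spacy/es/tokenizer_exceptions.py.
from spacy.symbols import ORTH, LEMMA

EXAMPLE_EXCEPTIONS = {
    "aprox.": [{ORTH: "aprox.", LEMMA: "aproximadamente"}],
    "p.ej.": [{ORTH: "p.ej.", LEMMA: "por ejemplo"}],
}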
@@ -5,6 +5,7 @@ from .. import language_data as base
 from ..language_data import update_exc, strings_to_exc
 from ..symbols import ORTH, LEMMA
 
+from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 
@@ -39,7 +40,7 @@ def get_time_exc(hours):
     ]
     return exc
 
-
+TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
 
 
@@ -51,4 +52,4 @@ update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
+__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
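The two helpers used above come from spacy/language_data: strings_to_exc turns a flat list of strings (such as base.EMOTICONS or base.ABBREVIATIONS) into exception entries, and update_exc merges them into the shared dict. A simplified sketch of that behaviour follows; the _sketch suffix marks these as stand-ins, not the real implementations.

from spacy.symbols import ORTH

def strings_to_exc_sketch(strings):
    # one single-token exception per string, e.g. ":)" -> [{ORTH: ":)"}]
    return {s: [{ORTH: s}] for s in strings}

def update_exc_sketch(exc, additions):
    # merge the new entries into TOKENIZER_EXCEPTIONS in place
    exc.update(additions)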
spacy/es/tag_map.py (1043 lines)
File diff suppressed because it is too large.
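The suppressed file carries the rich Spanish tagset announced in the commit message. spaCy tag maps associate each fine-grained tag string with a coarse part of speech plus morphological features; the entry below is purely illustrative, since the actual keys are not visible on this page.

from spacy.symbols import POS, NOUN

EXAMPLE_TAG_MAP_ENTRY = {
    "NOUN__Gender=Masc|Number=Sing": {POS: NOUN, "Gender": "Masc", "Number": "Sing"},
}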
@@ -49,6 +49,10 @@ def en_vocab():
 def en_parser():
     return English.Defaults.create_parser()
 
+@pytest.fixture
+def es_tokenizer():
+    return Spanish.Defaults.create_tokenizer()
+
 
 @pytest.fixture
 def de_tokenizer():
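pytest injects the new es_tokenizer fixture by argument name, so any test module in the suite can request a ready-made Spanish tokenizer without constructing one itself. A hypothetical smoke test (not part of the commit) showing the pattern:

def test_es_tokenizer_smoke(es_tokenizer):
    # the fixture returns a callable tokenizer that produces a Doc-like token sequence
    tokens = es_tokenizer("Hola mundo")
    assert len(tokens) == 2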
@@ -0,0 +1,24 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text,lemma', [("aprox.", "aproximadamente"),
+                                        ("esq.", "esquina"),
+                                        ("pág.", "página"),
+                                        ("p.ej.", "por ejemplo")
+                                        ])
+def test_tokenizer_handles_abbr(es_tokenizer, text, lemma):
+    tokens = es_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].lemma_ == lemma
+
+
+def test_tokenizer_handles_exc_in_text(es_tokenizer):
+    text = "Mariano Rajoy ha corrido aprox. medio kilómetro"
+    tokens = es_tokenizer(text)
+    assert len(tokens) == 7
+    assert tokens[4].text == "aprox."
+    assert tokens[4].lemma_ == "aproximadamente"
@@ -0,0 +1,35 @@
+# coding: utf-8
+
+"""Test that longer and mixed texts are tokenized correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_tokenizer_handles_long_text(es_tokenizer):
+    text = """Cuando a José Mujica lo invitaron a dar una conferencia
+
+en Oxford este verano, su cabeza hizo "crac". La "más antigua" universidad de habla
+
+inglesa, esa que cobra decenas de miles de euros de matrícula a sus alumnos
+
+y en cuyos salones han disertado desde Margaret Thatcher hasta Stephen Hawking,
+
+reclamaba los servicios de este viejo de 81 años, formado en un colegio público
+
+en Montevideo y que pregona las bondades de la vida austera."""
+    tokens = es_tokenizer(text)
+    assert len(tokens) == 90
+
+
+@pytest.mark.parametrize('text,length', [
+    ("¿Por qué José Mujica?", 6),
+    ("“¿Oh no?”", 6),
+    ("""¡Sí! "Vámonos", contestó José Arcadio Buendía""", 11),
+    ("Corrieron aprox. 10km.", 5),
+    ("Y entonces por qué...", 5)])
+def test_tokenizer_handles_cnts(es_tokenizer, text, length):
+    tokens = es_tokenizer(text)
+    assert len(tokens) == length