Add tokenizer exceptions for a.m. and p.m. in Spanish

2016-12-21 18:19:10 +01:00 · 2016-12-21 18:19:10 +01:00 · 3c87c71d43
parent d1a2846750
commit 3c87c71d43
1 changed files with 33 additions and 0 deletions
--- a/spacy/es/language_data.py
+++ b/spacy/es/language_data.py
@ -3,12 +3,45 @@ from __future__ import unicode_literals

 from .. import language_data as base
 from ..language_data import update_exc, strings_to_exc
+from ..symbols import ORTH, LEMMA

 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY


+def get_time_exc(hours):
+    exc = {
+        "12m.": [
+            {ORTH: "12"},
+            {ORTH: "m.", LEMMA: "p.m."}
+        ]
+    }
+
+    for hour in hours:
+        exc["%da.m." % hour] = [
+            {ORTH: hour},
+            {ORTH: "a.m."}
+        ]
+
+        exc["%dp.m." % hour] = [
+            {ORTH: hour},
+            {ORTH: "p.m."}
+        ]
+
+        exc["%dam" % hour] = [
+            {ORTH: hour},
+            {ORTH: "am", LEMMA: "a.m."}
+        ]
+
+        exc["%dpm" % hour] = [
+            {ORTH: hour},
+            {ORTH: "pm", LEMMA: "p.m."}
+        ]
+    return exc
+
+
 TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))  # hours 1-12
 STOP_WORDS = set(STOP_WORDS)