mirror of https://github.com/explosion/spaCy.git
new tests & tokenization fixes (#4734)
- added some tests for tokenization issues
- fixed some issues with tokenization of words with hyphen infix
- rewrote the "tokenizer_exceptions.py" file (stemming from the German version)
This commit is contained in:
parent
48ea2e8d0f
commit
a7ee4b6f17
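Before the diff hunks, a minimal sketch (not part of the commit) of how the changed tokenizer behaves end to end, assuming the Luxembourgish class from spaCy's lang/lb package:

from spacy.lang.lb import Luxembourgish

nlp = Luxembourgish()
doc = nlp("Mee 't ass net evident, d'Liewen.")
print([t.text for t in doc])
# the new tests below expect 9 tokens here: "'t" is kept whole via the
# tokenizer exceptions, and "d'Liewen" is split by the elision infix rule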
@@ -6,7 +6,7 @@ from __future__ import unicode_literals

# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
# here one could include the most common spelling mistakes

_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}
_exc = {"dass": "datt", "viläicht": "vläicht"}

NORM_EXCEPTIONS = {}
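The hunk stops at NORM_EXCEPTIONS = {}. In spaCy's other 2.x languages that dict is then filled from _exc, usually adding a title-cased variant of each key; a minimal runnable sketch of that assumed pattern:

_exc = {"dass": "datt", "viläicht": "vläicht"}

NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
    # keep the lowercase form and a title-cased variant, as other languages do
    NORM_EXCEPTIONS[string] = norm
    NORM_EXCEPTIONS[string.title()] = norm

print(NORM_EXCEPTIONS)
# {'dass': 'datt', 'Dass': 'datt', 'viläicht': 'vläicht', 'Viläicht': 'vläicht'}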
@@ -1,16 +1,23 @@
# coding: utf8
from __future__ import unicode_literals

from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA

from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER

ELISION = " ' ’ ".strip().replace(" ", "")
HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "")


_infixes = TOKENIZER_INFIXES + [
    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
]
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
        r"(?<=[0-9])-(?=[0-9])",
    ]
)

TOKENIZER_INFIXES = _infixes
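As a quick sanity check of the new infix rules, the two patterns most relevant to this commit (elision and digit-hyphen-digit) can be tried standalone with the stdlib re module; ALPHA comes from spacy.lang.char_classes, everything else below is illustrative:

import re

from spacy.lang.char_classes import ALPHA

ELISION = " ' ’ ".strip().replace(" ", "")
elision_infix = r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)

# zero-width match between "d'" and "Kanner", i.e. the tokenizer splits there
print(re.search(elision_infix, "d'Kanner"))
# matches the hyphen between digits, so "10-12" splits into "10", "-", "12"
print(re.search(r"(?<=[0-9])-(?=[0-9])", "10-12"))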
@@ -10,7 +10,9 @@ _exc = {}

# translate / delete what is not necessary
for exc_data in [
    {ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelieft"},
    {ORTH: "'t", LEMMA: "et", NORM: "et"},
    {ORTH: "'T", LEMMA: "et", NORM: "et"},
    {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"},
    {ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
    {ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
    {ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
@@ -18,7 +20,7 @@ for exc_data in [
    {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
    {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
    {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}
]:
    _exc[exc_data[ORTH]] = [exc_data]
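Each entry maps an ORTH string to a single-token exception, so that exact string survives tokenization as one token carrying the given LEMMA and NORM. An illustrative check (the sentence is invented for the example; the class name is taken from spaCy's lang/lb package):

from spacy.lang.lb import Luxembourgish

nlp =Luxembourgish()
doc = nlp("Kommt wgl. muer laanscht.")
# "wgl." should stay one token; per the new entry above its lemma is "wannechgelift"
print([(t.text, t.lemma_) for t in doc])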
@@ -3,8 +3,24 @@ from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize("text", ["z.B.", "Jan."])
def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
    tokens = lb_tokenizer(text)
    assert len(tokens) == 1


@pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"])
def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
    tokens = lb_tokenizer(text)
    assert len(tokens) == 2


def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
    text = "Mee 't ass net evident, d'Liewen."
    tokens = lb_tokenizer(text)
    assert len(tokens) == 9
    assert tokens[1].text == "'t"
    assert tokens[1].lemma_ == "et"


@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
def test_lb_norm_exceptions(lb_tokenizer, text, norm):
    tokens = lb_tokenizer(text)
    assert tokens[0].norm_ == norm
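The lb_tokenizer fixture these tests rely on is not part of this diff; in spaCy's test suite such fixtures are typically defined once in the shared conftest, roughly like this (assumed sketch):

import pytest

from spacy.util import get_lang_class


@pytest.fixture(scope="session")
def lb_tokenizer():
    # build the bare Luxembourgish tokenizer from the language defaults
    return get_lang_class("lb").Defaults.create_tokenizer()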
@@ -16,6 +16,7 @@ def test_lb_tokenizer_handles_long_text(lb_tokenizer):
    [
        ("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13),
        ("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15),
        ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14)
    ],
)
def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length):