# spaCy/spacy/tests/lang/lb/test_exceptions.py

# coding: utf-8
from __future__ import unicode_literals

import pytest

@pytest.mark.parametrize("text", ["z.B.", "Jan."])
def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
    """Each parametrized abbreviation must come back as exactly one token."""
    doc = lb_tokenizer(text)
    n_tokens = len(doc)
    assert n_tokens == 1

@pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"])
def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
    """Each parametrized contraction must be split into exactly two tokens.

    The inputs cover both the ASCII apostrophe (') and the typographic
    apostrophe (’).
    """
    doc = lb_tokenizer(text)
    n_tokens = len(doc)
    assert n_tokens == 2

def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
    """Tokenize a full sentence and inspect its second token.

    The sentence must yield nine tokens, and the token at index 1 must have
    the text "'t" and the lemma "et".
    """
    sentence = "Mee 't ass net evident, d'Liewen."
    doc = lb_tokenizer(sentence)
    # Check the overall count first so a wrong split is reported before indexing.
    assert len(doc) == 9
    second = doc[1]
    assert second.text == "'t"
    assert second.lemma_ == "et"

@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
def test_lb_norm_exceptions(lb_tokenizer, text, norm):
    """The first token's ``norm_`` must equal the expected normalized form."""
    doc = lb_tokenizer(text)
    first = doc[0]
    assert first.norm_ == norm
-												Initial commit: New language Luxembourgish (lb) (#4424)

* new language: Luxembourgish (lb)

* update

* update

* Update and rename .github/CONTRIBUTOR_AGREEMENT.md to .github/contributors/PeterGilles.md

* Update and rename .github/contributors/PeterGilles.md to .github/CONTRIBUTOR_AGREEMENT.md

* Update norm_exceptions.py

* Delete README.md

* moved test_lemma.py

* deactivated 'lemma_lookup = LOOKUP'

* update

* Update conftest.py

* update

* tests updated

* import unicode_literals

* Update spacy/tests/lang/lb/test_text.py

Co-Authored-By: Ines Montani <ines@ines.io>

* Create PeterGilles.md

											
										
										
											2019-10-14 10:27:50 +00:00
+								# coding: utf-8
 								from __future__ import unicode_literals
 								import pytest
 								@pytest.mark.parametrize("text", ["z.B.", "Jan."])
 								def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
 								    tokens = lb_tokenizer(text)
 								    assert len(tokens) == 1
-												new tests & tokenization fixes (#4734)

- added some tests for tokenization issues
- fixed some issues with tokenization of words with hyphen infix
- rewrote the "tokenizer_exceptions.py" file (stemming from the German version)
											
										
										
											2019-12-01 22:08:21 +00:00
 								@pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"])
 								def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
 								    tokens = lb_tokenizer(text)
 								    assert len(tokens) == 2
 								def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
 								    text = "Mee 't ass net evident, d'Liewen."
 								    tokens = lb_tokenizer(text)
 								    assert len(tokens) == 9
 								    assert tokens[1].text == "'t"
 								    assert tokens[1].lemma_ == "et"
 								@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
 								def test_lb_norm_exceptions(lb_tokenizer, text, norm):
 								    tokens = lb_tokenizer(text)
 								    assert tokens[0].norm_ == norm