spaCy/spacy/tests/lang/fr/test_exceptions.py

# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "text",
    [
        "aujourd'hui",
        "Aujourd'hui",
        "prud'hommes",
        "prud’hommal",
        "audio-numérique",
        "Audio-numérique",
        "entr'amis",
        "entr'abat",
        "rentr'ouvertes",
        "grand'hamien",
        "Châteauneuf-la-Forêt",
        "Château-Guibert",
        "11-septembre",
        "11-Septembre",
        "refox-trottâmes",
        "K-POP",
        "K-Pop",
        "K-pop",
        "z'yeutes",
        "black-outeront",
        "états-unienne",
        "courtes-pattes",
        "court-pattes",
        "saut-de-ski",
        "Écourt-Saint-Quentin",
        "Bout-de-l'Îlien",
        "pet-en-l'air",
    ],
)
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
    """Words containing an apostrophe or a hyphen must stay in one piece.

    Every parametrized word is passed to the French tokenizer fixture on
    its own, and the result is expected to hold exactly one token.
    """
    doc = fr_tokenizer(text)
    token_count = len(doc)
    assert token_count == 1


@pytest.mark.parametrize(
    "text,lemma",
    [
        ("janv.", "janvier"),
        ("juill.", "juillet"),
        ("Dr.", "docteur"),
        ("av.", "avant"),
        ("sept.", "septembre"),
    ],
)
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
    """Dotted abbreviations stay whole and carry their expanded lemma.

    Each abbreviation (trailing period included) must come back as a
    single token whose ``lemma_`` equals the paired full word.
    """
    doc = fr_tokenizer(text)
    assert len(doc) == 1
    only_token = doc[0]
    assert only_token.lemma_ == lemma


def test_fr_tokenizer_handles_exc_in_text(fr_tokenizer):
    """Exceptions must also be honoured in the middle of a sentence.

    The sentence is expected to produce 10 tokens, with the abbreviation
    "janv." (lemma "janvier") at index 6 and the apostrophe word
    "prud’hommes" kept whole at index 8.
    """
    doc = fr_tokenizer("Je suis allé au mois de janv. aux prud’hommes.")
    assert len(doc) == 10
    abbreviation = doc[6]
    assert abbreviation.text == "janv."
    assert abbreviation.lemma_ == "janvier"
    apostrophe_word = doc[8]
    assert apostrophe_word.text == "prud’hommes"


def test_fr_tokenizer_handles_exc_in_text_2(fr_tokenizer):
    """Hyphenated compounds must survive inside a full sentence.

    The sentence is expected to produce 11 tokens, with "après-midi" at
    index 1 and "italo-mexicain" at index 9, each kept as one token.
    """
    doc = fr_tokenizer(
        "Cette après-midi, je suis allé dans un restaurant italo-mexicain."
    )
    assert len(doc) == 11
    # (token index, expected surface form) for each hyphenated compound.
    for position, compound in ((1, "après-midi"), (9, "italo-mexicain")):
        assert doc[position].text == compound


def test_fr_tokenizer_handles_title(fr_tokenizer):
    """A sentence opening with capitalised "N'est-ce" is split and lemmatized.

    The sentence is expected to produce 6 tokens; the first three must be
    "N'", "est" and "-ce" with the lemmas "ne", "être" and "ce".
    """
    doc = fr_tokenizer("N'est-ce pas génial?")
    assert len(doc) == 6
    # Expected (surface form, lemma) of the leading tokens, in order.
    leading = [("N'", "ne"), ("est", "être"), ("-ce", "ce")]
    for index, (form, lemma) in enumerate(leading):
        assert doc[index].text == form
        assert doc[index].lemma_ == lemma


@pytest.mark.xfail
def test_fr_tokenizer_handles_title_2(fr_tokenizer):
    """A sentence opening with capitalised "Est-ce" is split and lemmatized.

    The sentence is expected to produce 5 tokens, the first being "Est"
    with the lemma "être".

    NOTE(review): the xfail marker is kept because it cannot be confirmed
    from this file that the tokenizer handles the capitalised "Est-ce";
    remove the marker once the test is seen to pass.
    """
    text = "Est-ce pas génial?"
    tokens = fr_tokenizer(text)
    # "Est", "-ce", "pas", "génial", "?" -- one token fewer than the 6 that
    # test_fr_tokenizer_handles_title expects for "N'est-ce pas génial?",
    # which has the additional leading "N'". The previous expectation of 6
    # could never be met by this sentence.
    assert len(tokens) == 5
    assert tokens[0].text == "Est"
    assert tokens[0].lemma_ == "être"


def test_fr_tokenizer_handles_title_3(fr_tokenizer):
    """A sentence opening with capitalised "Qu'est-ce" splits off "Qu'".

    The sentence is expected to produce 7 tokens, the first being "Qu'"
    with the lemma "que".
    """
    doc = fr_tokenizer("Qu'est-ce que tu fais?")
    assert len(doc) == 7
    opener = doc[0]
    assert opener.text == "Qu'"
    assert opener.lemma_ == "que"
-												Add fr tokenization unit tests


											
										
										
											2017-01-24 09:55:02 +00:00
+								# coding: utf-8
 								from __future__ import unicode_literals
 								import pytest
-												Try to fix memory error by moving fr_tokenizer to module scope

											
										
										
											2018-07-24 18:09:06 +00:00
-												Add fr tokenization unit tests


											
										
										
											2017-01-24 09:55:02 +00:00
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 00:09:36 +00:00
+								@pytest.mark.parametrize(
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "text", ["aujourd'hui", "Aujourd'hui", "prud'hommes", "prud’hommal",
 								             "audio-numérique", "Audio-numérique",
 								             "entr'amis", "entr'abat", "rentr'ouvertes", "grand'hamien",
 								             "Châteauneuf-la-Forêt", "Château-Guibert",
 								             "11-septembre", "11-Septembre", "refox-trottâmes",
 								             "K-POP", "K-Pop", "K-pop", "z'yeutes",
 								             "black-outeront", "états-unienne",
 								             "courtes-pattes", "court-pattes",
 								             "saut-de-ski", "Écourt-Saint-Quentin", "Bout-de-l'Îlien", "pet-en-l'air"]
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 00:09:36 +00:00
+								)
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-24 21:38:44 +00:00
+								def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
-												Add fr tokenization unit tests


											
										
										
											2017-01-24 09:55:02 +00:00
+								    tokens = fr_tokenizer(text)
 								    assert len(tokens) == 1
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 00:09:36 +00:00
+								@pytest.mark.parametrize(
 								    "text,lemma",
 								    [
 								        ("janv.", "janvier"),
 								        ("juill.", "juillet"),
 								        ("Dr.", "docteur"),
 								        ("av.", "avant"),
 								        ("sept.", "septembre"),
 								    ],
 								)
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-24 21:38:44 +00:00
+								def test_fr_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
-												Add fr tokenization unit tests


											
										
										
											2017-01-24 09:55:02 +00:00
+								    tokens = fr_tokenizer(text)
 								    assert len(tokens) == 1
 								    assert tokens[0].lemma_ == lemma
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-24 21:38:44 +00:00
+								def test_fr_tokenizer_handles_exc_in_text(fr_tokenizer):
-												Add fr tokenization unit tests


											
										
										
											2017-01-24 09:55:02 +00:00
+								    text = "Je suis allé au mois de janv. aux prud’hommes."
 								    tokens = fr_tokenizer(text)
 								    assert len(tokens) == 10
 								    assert tokens[6].text == "janv."
 								    assert tokens[6].lemma_ == "janvier"
 								    assert tokens[8].text == "prud’hommes"
-												Revert "Revert "Merge pull request #818 from raphael0202/tokenizer_exceptions""

This reverts commit f02a2f9322969a637ee2445efd7d1901d2a0d09a.

											
										
										
											2017-02-10 12:17:05 +00:00
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-24 21:38:44 +00:00
+								def test_fr_tokenizer_handles_exc_in_text_2(fr_tokenizer):
-												Revert "Revert "Merge pull request #818 from raphael0202/tokenizer_exceptions""

This reverts commit f02a2f9322969a637ee2445efd7d1901d2a0d09a.

											
										
										
											2017-02-10 12:17:05 +00:00
+								    text = "Cette après-midi, je suis allé dans un restaurant italo-mexicain."
 								    tokens = fr_tokenizer(text)
 								    assert len(tokens) == 11
 								    assert tokens[1].text == "après-midi"
-												Adding unitest for tokenization in french (with title)

											
										
										
											2017-04-27 08:59:38 +00:00
+								    assert tokens[9].text == "italo-mexicain"
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-24 21:38:44 +00:00
 								def test_fr_tokenizer_handles_title(fr_tokenizer):
-												Adding unitest for tokenization in french (with title)

											
										
										
											2017-04-27 08:59:38 +00:00
+								    text = "N'est-ce pas génial?"
 								    tokens = fr_tokenizer(text)
 								    assert len(tokens) == 6
 								    assert tokens[0].text == "N'"
 								    assert tokens[0].lemma_ == "ne"
-												correcting tokenizer exception.
Adding tests for lemmatization

											
										
										
											2017-04-27 09:52:14 +00:00
+								    assert tokens[1].text == "est"
 								    assert tokens[1].lemma_ == "être"
-												Adding unitest for tokenization in french (with title)

											
										
										
											2017-04-27 08:59:38 +00:00
+								    assert tokens[2].text == "-ce"
 								    assert tokens[2].lemma_ == "ce"
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-24 21:38:44 +00:00
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 00:09:36 +00:00
+								@pytest.mark.xfail
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-24 21:38:44 +00:00
+								def test_fr_tokenizer_handles_title_2(fr_tokenizer):
-												Adding unitest for tokenization in french (with title)

											
										
										
											2017-04-27 08:59:38 +00:00
+								    text = "Est-ce pas génial?"
 								    tokens = fr_tokenizer(text)
 								    assert len(tokens) == 6
 								    assert tokens[0].text == "Est"
 								    assert tokens[0].lemma_ == "être"
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-24 21:38:44 +00:00
-												💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility

											
										
										
											2018-11-27 00:09:36 +00:00
+								def test_fr_tokenizer_handles_title_3(fr_tokenizer):
-												Adding unitest for tokenization in french (with title)

											
										
										
											2017-04-27 08:59:38 +00:00
+								    text = "Qu'est-ce que tu fais?"
 								    tokens = fr_tokenizer(text)
 								    assert len(tokens) == 7
 								    assert tokens[0].text == "Qu'"
-												💫 Refactor test suite (#2568)

## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests


### Types of change
enhancement, tests

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-07-24 21:38:44 +00:00
+								    assert tokens[0].lemma_ == "que"