2017-01-24 09:55:02 +00:00
|
|
|
|
# coding: utf-8
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
|
|
import pytest
|
2018-07-24 18:09:06 +00:00
|
|
|
|
from .... import util
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope='module')
def fr_tokenizer():
    """Module-scoped French tokenizer built from the language defaults."""
    lang_cls = util.get_lang_class('fr')
    return lang_cls.Defaults.create_tokenizer()
|
2017-01-24 09:55:02 +00:00
|
|
|
|
|
|
|
|
|
|
2018-07-24 21:38:44 +00:00
|
|
|
|
@pytest.mark.parametrize('text', [
    "aujourd'hui", "Aujourd'hui", "prud'hommes", "prud’hommal"])
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
    """Apostrophe words listed as tokenizer exceptions stay a single token."""
    doc = fr_tokenizer(text)
    assert len(doc) == 1
|
|
|
|
|
|
|
|
|
|
|
2018-07-24 21:38:44 +00:00
|
|
|
|
@pytest.mark.parametrize('text,lemma', [
    ("janv.", "janvier"),
    ("juill.", "juillet"),
    ("Dr.", "docteur"),
    ("av.", "avant"),
    ("sept.", "septembre")])
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
    """Known abbreviations tokenize to one token carrying the full lemma."""
    doc = fr_tokenizer(text)
    assert len(doc) == 1
    assert doc[0].lemma_ == lemma
|
|
|
|
|
|
|
|
|
|
|
2018-07-24 21:38:44 +00:00
|
|
|
|
def test_fr_tokenizer_handles_exc_in_text(fr_tokenizer):
    """Abbreviation and apostrophe exceptions survive inside running text."""
    doc = fr_tokenizer("Je suis allé au mois de janv. aux prud’hommes.")
    assert len(doc) == 10
    abbr = doc[6]
    assert abbr.text == "janv."
    assert abbr.lemma_ == "janvier"
    assert doc[8].text == "prud’hommes"
|
2017-02-10 12:17:05 +00:00
|
|
|
|
|
|
|
|
|
|
2018-07-24 21:38:44 +00:00
|
|
|
|
def test_fr_tokenizer_handles_exc_in_text_2(fr_tokenizer):
    """Hyphenated words stay single tokens when embedded in a sentence."""
    doc = fr_tokenizer("Cette après-midi, je suis allé dans un restaurant italo-mexicain.")
    assert len(doc) == 11
    assert doc[1].text == "après-midi"
    assert doc[9].text == "italo-mexicain"
|
|
|
|
|
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
def test_fr_tokenizer_handles_title(fr_tokenizer):
    """Elided "N'" plus inverted "est-ce" split into the expected pieces."""
    doc = fr_tokenizer("N'est-ce pas génial?")
    assert len(doc) == 6
    expected = [("N'", "ne"), ("est", "être"), ("-ce", "ce")]
    for token, (text, lemma) in zip(doc, expected):
        assert token.text == text
        assert token.lemma_ == lemma
|
|
|
|
|
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
def test_fr_tokenizer_handles_title_2(fr_tokenizer):
    """Sentence-initial inverted verb form keeps its infinitive lemma."""
    doc = fr_tokenizer("Est-ce pas génial?")
    assert len(doc) == 6
    first = doc[0]
    assert first.text == "Est"
    assert first.lemma_ == "être"
|
|
|
|
|
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
def test_fr_tokenizer_handles_title_3(fr_tokenizer):
    """Elided "Qu'" at sentence start is split off and lemmatized to "que".

    Renamed from ``test_fr_tokenizer_handles_title_2``: that name duplicated
    the preceding test, so this later definition silently shadowed it and
    pytest collected only one of the two tests.
    """
    tokens = fr_tokenizer("Qu'est-ce que tu fais?")
    assert len(tokens) == 7
    assert tokens[0].text == "Qu'"
    assert tokens[0].lemma_ == "que"
|