spaCy/spacy/lang/fr/tokenizer_exceptions.py

# coding: utf8
from __future__ import unicode_literals

import re

from .punctuation import ELISION, HYPHENS
from ..tokenizer_exceptions import URL_PATTERN
from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH, LEMMA, TAG

# The large `_tokenizer_exceptions_list` module is deliberately not imported by
# default because it slows down the tokenizer; a minimal hand-written list is
# used instead. The original import is kept below for reference.
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]


def upper_first_letter(text):
    """Return `text` with its first character upper-cased.

    Everything after the first character is left exactly as it is. An empty
    input is returned unchanged.
    """
    if not text:
        return text
    head, tail = text[0], text[1:]
    return head.upper() + tail


def lower_first_letter(text):
    """Return `text` with its first character lower-cased.

    Everything after the first character is left exactly as it is. An empty
    input is returned unchanged.
    """
    if not text:
        return text
    head, tail = text[0], text[1:]
    return head.lower() + tail


# "J.-C." is the one seed entry that is split into two sub-tokens.
_exc = {"J.-C.": [{LEMMA: "Jésus", ORTH: "J."}, {LEMMA: "Christ", ORTH: "-C."}]}


# Abbreviations stored as a single sub-token carrying a lemma, listed as
# (surface form, lemma) pairs.
for abbr_orth, abbr_lemma in [
    ("av.", "avant"),
    ("janv.", "janvier"),
    ("févr.", "février"),
    ("avr.", "avril"),
    ("juill.", "juillet"),
    ("sept.", "septembre"),
    ("oct.", "octobre"),
    ("nov.", "novembre"),
    ("déc.", "décembre"),
    ("apr.", "après"),
    ("Dr.", "docteur"),
    ("M.", "monsieur"),
    ("Mr.", "monsieur"),
    ("Mme.", "madame"),
    ("Mlle.", "mademoiselle"),
    ("n°", "numéro"),
    ("d°", "degrés"),
    ("St.", "saint"),
    ("Ste.", "sainte"),
]:
    _exc[abbr_orth] = [{LEMMA: abbr_lemma, ORTH: abbr_orth}]


# "etc." is registered with its surface form only (no lemma).
_exc["etc."] = [{ORTH: "etc."}]


# Inverted verb + "-t-" + subject pronoun, e.g. "a-t-il" / "Semble-t-elle".
# Each entry is split into three sub-tokens: the verb, "-t" and "-<pronoun>".
for base_form, lemma in (
    ("a", "avoir"),
    ("est", "être"),
    ("semble", "sembler"),
    ("indique", "indiquer"),
    ("moque", "moquer"),
    ("passe", "passer"),
):
    # Register both the given spelling and its title-cased spelling.
    for verb_orth in (base_form, base_form.title()):
        for subject in ("elle", "il", "on"):
            pieces = [
                {LEMMA: lemma, ORTH: verb_orth, TAG: "VERB"},
                {LEMMA: "t", ORTH: "-t"},
                {LEMMA: subject, ORTH: "-" + subject},
            ]
            _exc["{}-t-{}".format(verb_orth, subject)] = pieces

# "est-ce" / "Est-ce": the verb followed by "-ce".
for verb_orth in ("est", "Est"):
    _exc["{}-ce".format(verb_orth)] = [
        {LEMMA: "être", ORTH: verb_orth, TAG: "VERB"},
        {LEMMA: "ce", ORTH: "-ce"},
    ]


# "qu'est-ce" / "n'est-ce" (and title-cased variants): elided word + "est" + "-ce".
for elided, elided_lemma in (("qu'", "que"), ("n'", "ne")):
    for elided_orth in (elided, elided.title()):
        _exc["%sest-ce" % elided_orth] = [
            {LEMMA: elided_lemma, ORTH: elided_orth, TAG: "ADV"},
            {LEMMA: "être", ORTH: "est", TAG: "VERB"},
            {LEMMA: "ce", ORTH: "-ce"},
        ]


_infixes_exc = []
orig_elision = "'"
orig_hyphen = "-"

# For every base exception, also generate the spellings obtained by substituting
# the elision and hyphen characters that were not used in the original list, and
# finally a copy of each spelling with its first letter upper-cased.
_alt_elisions = [char for char in ELISION if char != orig_elision]
_alt_hyphens = [char for char in ["-", "‐"] if char != orig_hyphen]
for base_form in FR_BASE_EXCEPTIONS:
    spellings = {base_form}
    # The new spellings are materialised as a list before each update because
    # the set being iterated is the one being extended.
    for alt_char in _alt_elisions:
        spellings.update(
            [form.replace(orig_elision, alt_char) for form in spellings]
        )
    for alt_char in _alt_hyphens:
        spellings.update(
            [form.replace(orig_hyphen, alt_char) for form in spellings]
        )
    spellings.update([upper_first_letter(form) for form in spellings])
    _infixes_exc.extend(spellings)

# Each generated spelling is registered as a single-token exception.
_exc.update({spelling: [{ORTH: spelling}] for spelling in _infixes_exc})


# Regex fragments for word prefixes that may be followed by a hyphen and more
# letters (e.g. "faux" in "faux-vampire"). Each entry is inserted verbatim after
# "^" in a larger pattern, so any alternation inside an entry must stay wrapped
# in a non-capturing group.
# The former stand-alone "ex" entry was dropped: "ex?" already matches both "e"
# and "ex", so it only added a redundant alternative to the compiled pattern.
_hyphen_prefix = [
    "a[ée]ro",
    "abat",
    "a[fg]ro",
    "after",
    "aigues?",
    "am[ée]ricano",
    "anglo",
    "anti",
    "apr[èe]s",
    "arabo",
    "arcs?",
    "archi",
    "arrières?",
    "audio",
    "avant",
    "avion",
    "auto",
    "banc",
    "bas(?:ses?)?",
    "bateaux?",
    "bec?",
    "belles?",
    "beau",
    "best",
    "bio?",
    "bien",
    "blanc",
    "bo[îi]te",
    "bonn?e?s?",
    "bois",
    "bou(?:c|rg)",
    "b[êe]ta",
    "cache",
    "cap(?:ello)?",
    "casse",
    "castel",
    "champ",
    "chapelle",
    "ch[âa]teau(?:neuf)?",
    "chasse",
    "cha(?:ud|t)e?s?",
    "chauffe",
    "chou",
    "chromo",
    "claire?s?",
    "co(?:de|ca)?",
    "compte",
    "contre",
    "cordon",
    "coupe?",
    "courte?s?",
    "couvre",
    "crash",
    "crise",
    "croche",
    "cross",
    "cyber",
    "côte",
    "demi",
    "di(?:sney)?",
    "dix",
    "d[ée]s?",
    "dys",
    "ex?",
    "émirato",
    "entre",
    "est",
    "ethno",
    "extra",
    "extrême",
    "[ée]co",
    "faux",
    "fil",
    "fort",
    "franco?s?",
    "gallo",
    "gardes?",
    "gastro",
    "grande?",
    "gratte",
    "gr[ée]co",
    "gros",
    "g[ée]o",
    "haute?s?",
    "homm?es?",
    "hors",
    "hyper",
    "indo",
    "infra",
    "inter",
    "intra",
    "islamo",
    "italo",
    "jean",
    "labio",
    "latino",
    "live",
    "lot",
    "louis",
    "m[ai]cro",
    "mal",
    "médio",
    "mesnil",
    "mi(?:ni)?",
    "mono",
    "mont?s?",
    "moyen",
    "multi",
    "m[ée]cano",
    "m[ée]dico",
    "m[ée]do",
    "m[ée]ta",
    "mots?",
    "neuro",
    "noix",
    "non",
    "nord",
    "notre",
    "n[ée]o",
    "ouest",
    "outre",
    "ouvre",
    "passe",
    "perce",
    "pharmaco",
    "ph[oy]to",
    "pieds?",
    "pique",
    "poissons?",
    "ponce",
    "pont",
    "po[rs]t",
    "pousse",
    "primo",
    "pro(?:cès|to)?",
    "pare",
    "petite?s?",
    "plessis",
    "porte",
    "pré",
    "prêchi",
    "protège",
    "pseudo",
    "pêle",
    "péri",
    "puy",
    "quasi",
    "quatre",
    "radio",
    "recourt",
    "rythmo",
    "(?:re)?doubles?",
    "r[ée]",
    "r[ée]tro",
    "requin",
    "sans?",
    "sa?inte?s?",
    "semi",
    "serre",
    "sino",
    "socio",
    "sociale?s?",
    "soixante",
    "sous",
    "su[bdrs]",
    "super",
    "taille",
    "tire",
    "thermo",
    "tiers",
    "tourne",
    "toute?s?",
    "tra[iî]ne?",
    "trans",
    "trente",
    "trois",
    "trousse",
    "tr(?:i|ou)",
    "t[ée]l[ée]",
    "utéro",
    "vaso",
    "vi[cd]e",
    "vid[ée]o",
    "vie(?:ux|i?lles?|i?l)",
    "vill(?:e|eneuve|ers|ette|iers|y)",
    "vingt",
    "voitures?",
    "wagons?",
    "ultra",
    "à",
    "[ée]lectro",
    "[ée]qui",
    "Fontaine",
    "La Chapelle",
    "Marie",
    "Le Mesnil",
    "Neuville",
    "Pierre",
    "Val",
    "Vaux",
]

# Whole-string patterns for specific hyphenated or elided words. Each template
# is anchored with "^...$"; the placeholders "{hyphen}", "{elision}" and "{al}"
# are filled with the HYPHENS, ELISION and ALPHA_LOWER character sets below.
# A trailing "[{al}]+" requires at least one more letter after the stem, while
# "[{al}]*" also accepts the bare stem.
# The "droits-de-l'homm..." template uses the elision class, like the other
# elided patterns here, instead of a hard-coded straight apostrophe.
_regular_exp_templates = [
    "^a[{hyphen}]sexualis[{al}]+$",
    "^arginine[{hyphen}]méthyl[{al}]+$",
    "^binge[{hyphen}]watch[{al}]+$",
    "^black[{hyphen}]out[{al}]*$",
    "^bouche[{hyphen}]por[{al}]+$",
    "^burn[{hyphen}]out[{al}]*$",
    "^by[{hyphen}]pass[{al}]*$",
    "^ch[{elision}]tiis[{al}]+$",
    "^chape[{hyphen}]chut[{al}]+$",
    "^down[{hyphen}]load[{al}]*$",
    "^[ée]tats[{hyphen}]uni[{al}]*$",
    "^droits?[{hyphen}]de[{hyphen}]l[{elision}]homm[{al}]+$",
    "^fac[{hyphen}]simil[{al}]*$",
    "^fleur[{hyphen}]bleuis[{al}]+$",
    "^flic[{hyphen}]flaqu[{al}]+$",
    "^fox[{hyphen}]trott[{al}]+$",
    "^google[{hyphen}]is[{al}]+$",
    "^hard[{hyphen}]discount[{al}]*$",
    "^hip[{hyphen}]hop[{al}]*$",
    "^jet[{hyphen}]set[{al}]*$",
    "^knock[{hyphen}]out[{al}]*$",
    "^lèche[{hyphen}]bott[{al}]+$",
    "^litho[{hyphen}]typographi[{al}]+$",
    "^lock[{hyphen}]out[{al}]*$",
    "^lombri[{hyphen}]compost[{al}]+$",
    "^mac[{hyphen}]adamis[{al}]+$",
    "^marque[{hyphen}]pag[{al}]+$",
    "^mouton[{hyphen}]noiris[{al}]+$",
    "^new[{hyphen}]york[{al}]*$",
    "^pair[{hyphen}]programm[{al}]+$",
    "^people[{hyphen}]is[{al}]+$",
    "^plan[{hyphen}]socialis[{al}]+$",
    "^premier[{hyphen}]ministr[{al}]*$",
    "^prud[{elision}]hom[{al}]+$",
    "^réarc[{hyphen}]bout[{al}]+$",
    "^refox[{hyphen}]trott[{al}]+$",
    "^remicro[{hyphen}]ond[{al}]+$",
    "^repique[{hyphen}]niqu[{al}]+$",
    "^repetit[{hyphen}]déjeun[{al}]+$",
    "^rick[{hyphen}]roll[{al}]*$",
    "^rond[{hyphen}]ponn[{al}]+$",
    "^shift[{hyphen}]cliqu[{al}]+$",
    "^soudo[{hyphen}]bras[{al}]+$",
    "^stabilo[{hyphen}]boss[{al}]+$",
    "^strip[{hyphen}]teas[{al}]+$",
    "^terra[{hyphen}]form[{al}]*$",
    "^teuf[{hyphen}]teuf[{al}]*$",
    "^yo[{hyphen}]yo[{al}]+$",
    "^zig[{hyphen}]zag[{al}]*$",
    "^z[{elision}]yeut[{al}]+$",
]
# str.format ignores keyword arguments a template does not reference, so all
# three character sets can be supplied to every template.
_regular_exp = [
    template.format(hyphen=HYPHENS, elision=ELISION, al=ALPHA_LOWER)
    for template in _regular_exp_templates
]
# Hyphenated words that start with one of the known prefixes, e.g. faux-vampire.
# HYPHENS comes first inside the trailing character class so that the "-" does
# not need a backslash.
_regular_exp.extend(
    "^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
        al=ALPHA_LOWER, elision=ELISION, hyphen=HYPHENS, prefix=prefix
    )
    for prefix in _hyphen_prefix
)

# Elided words that start with one of a few prefixes, e.g. entr'abat.
_elision_prefix = ["r?é?entr", "grande?s?", "r"]
_regular_exp.extend(
    "^{prefix}[{elision}][{al}][{hyphen}{al}{elision}]*$".format(
        al=ALPHA_LOWER, elision=ELISION, hyphen=HYPHENS, prefix=prefix
    )
    for prefix in _elision_prefix
)

# Words joined by a hyphenated linking word, optionally followed by an elided
# "l", e.g. saut-de-ski or pet-en-l'air.
_hyphen_combination = [
    "l[èe]s?", "la", "en", "des?", "d[eu]", "sur",
    "sous", "aux?", "à", "et", "près", "saint"
]
_regular_exp.extend(
    "^[{a}]+[{hyphen}]{hyphen_combo}[{hyphen}](?:l[{elision}])?[{a}]+$".format(
        a=ALPHA, hyphen=HYPHENS, elision=ELISION, hyphen_combo=linker
    )
    for linker in _hyphen_combination
)

# The URL pattern is the last alternative.
_regular_exp += [URL_PATTERN]


TOKENIZER_EXCEPTIONS = _exc
# TOKEN_MATCH is the bound `match` method of one case-insensitive regex that
# ORs every pattern together, each wrapped in its own non-capturing group.
TOKEN_MATCH = re.compile(
    "|".join(map("(?:{})".format, _regular_exp)), flags=re.IGNORECASE
).match
-												Use consistent unicode declarations

											
										
										
											2017-03-12 12:07:28 +00:00
+								# coding: utf8
-												Revert "Revert "Merge pull request #818 from raphael0202/tokenizer_exceptions""

This reverts commit f02a2f9322969a637ee2445efd7d1901d2a0d09a.

											
										
										
											2017-02-10 12:17:05 +00:00
+								from __future__ import unicode_literals
-												Replacing regex library with re to increase tokenization speed (#3218)

* replace unicode categories with raw list of code points

* simplifying ranges

* fixing variable length quotes

* removing redundant regular expression

* small cleanup of regexp notations

* quotes and alpha as ranges instead of alterations

* removed most regexp dependencies and features

* exponential backtracking - unit tests

* rewrote expression with pathological backtracking

* disabling double hyphen tests for now

* test additional variants of repeating punctuation

* remove regex and redundant backslashes from load_reddit script

* small typo fixes

* disable double punctuation test for russian

* clean up old comments

* format block code

* final cleanup

* naming consistency

* french strings as unicode for python 2 support

* french regular expression case insensitive

											
										
										
											2019-02-01 07:05:22 +00:00
+								import re
-												Revert "Revert "Merge pull request #818 from raphael0202/tokenizer_exceptions""

This reverts commit f02a2f9322969a637ee2445efd7d1901d2a0d09a.

											
										
										
											2017-02-10 12:17:05 +00:00
-												Fix relative imports

											
										
										
											2017-05-08 20:29:04 +00:00
+								from .punctuation import ELISION, HYPHENS
 								from ..tokenizer_exceptions import URL_PATTERN
-												Replacing regex library with re to increase tokenization speed (#3218)

* replace unicode categories with raw list of code points

* simplifying ranges

* fixing variable length quotes

* removing redundant regular expression

* small cleanup of regexp notations

* quotes and alpha as ranges instead of alterations

* removed most regexp dependencies and features

* exponential backtracking - unit tests

* rewrote expression with pathological backtracking

* disabling double hyphen tests for now

* test additional variants of repeating punctuation

* remove regex and redundant backslashes from load_reddit script

* small typo fixes

* disable double punctuation test for russian

* clean up old comments

* format block code

* final cleanup

* naming consistency

* french strings as unicode for python 2 support

* french regular expression case insensitive

											
										
										
											2019-02-01 07:05:22 +00:00
+								from ..char_classes import ALPHA_LOWER, ALPHA
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								from ...symbols import ORTH, LEMMA, TAG
-												Convert exceptions to Python list

											
										
										
											2017-02-24 17:22:40 +00:00
-												Clean up of char classes, few tokenizer fixes and faster default French tokenizer (#3293)

* splitting up latin unicode interval

* removing hyphen as infix for French

* adding failing test for issue 1235

* test for issue #3002 which now works

* partial fix for issue #2070

* keep the hyphen as infix for French (as it was)

* restore french expressions with hyphen as infix (as it was)

* added succeeding unit test for Issue #2656

* Fix issue #2822 with custom Italian exception

* Fix issue #2926 by allowing numbers right before infix /

* splitting up latin unicode interval

* removing hyphen as infix for French

* adding failing test for issue 1235

* test for issue #3002 which now works

* partial fix for issue #2070

* keep the hyphen as infix for French (as it was)

* restore french expressions with hyphen as infix (as it was)

* added succeeding unit test for Issue #2656

* Fix issue #2822 with custom Italian exception

* Fix issue #2926 by allowing numbers right before infix /

* remove duplicate

* remove xfail for Issue #2179 fixed by Matt

* adjust documentation and remove reference to regex lib

											
										
										
											2019-02-20 21:10:13 +00:00
+								# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
 								# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
 								FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
-												Revert "Revert "Merge pull request #818 from raphael0202/tokenizer_exceptions""

This reverts commit f02a2f9322969a637ee2445efd7d1901d2a0d09a.

											
										
										
											2017-02-10 12:17:05 +00:00
 								def upper_first_letter(text):
 								    if len(text) == 0:
 								        return text
 								    if len(text) == 1:
 								        return text.upper()
 								    return text[0].upper() + text[1:]
 								def lower_first_letter(text):
 								    if len(text) == 0:
 								        return text
 								    if len(text) == 1:
 								        return text.lower()
 								    return text[0].lower() + text[1:]
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								_exc = {"J.-C.": [{LEMMA: "Jésus", ORTH: "J."}, {LEMMA: "Christ", ORTH: "-C."}]}
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
 								for exc_data in [
 								    {LEMMA: "avant", ORTH: "av."},
 								    {LEMMA: "janvier", ORTH: "janv."},
 								    {LEMMA: "février", ORTH: "févr."},
 								    {LEMMA: "avril", ORTH: "avr."},
 								    {LEMMA: "juillet", ORTH: "juill."},
 								    {LEMMA: "septembre", ORTH: "sept."},
 								    {LEMMA: "octobre", ORTH: "oct."},
 								    {LEMMA: "novembre", ORTH: "nov."},
 								    {LEMMA: "décembre", ORTH: "déc."},
 								    {LEMMA: "après", ORTH: "apr."},
 								    {LEMMA: "docteur", ORTH: "Dr."},
 								    {LEMMA: "monsieur", ORTH: "M."},
 								    {LEMMA: "monsieur", ORTH: "Mr."},
 								    {LEMMA: "madame", ORTH: "Mme."},
 								    {LEMMA: "mademoiselle", ORTH: "Mlle."},
 								    {LEMMA: "numéro", ORTH: "n°"},
 								    {LEMMA: "degrés", ORTH: "d°"},
 								    {LEMMA: "saint", ORTH: "St."},
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    {LEMMA: "sainte", ORTH: "Ste."},
 								]:
-												Tidy up tokenizer exceptions

											
										
										
											2017-11-01 22:02:45 +00:00
+								    _exc[exc_data[ORTH]] = [exc_data]
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								for orth in ["etc."]:
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
+								    _exc[orth] = [{ORTH: orth}]
 								for verb, verb_lemma in [
 								    ("a", "avoir"),
 								    ("est", "être"),
 								    ("semble", "sembler"),
 								    ("indique", "indiquer"),
 								    ("moque", "moquer"),
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    ("passe", "passer"),
 								]:
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
+								    for orth in [verb, verb.title()]:
 								        for pronoun in ["elle", "il", "on"]:
 								            token = "{}-t-{}".format(orth, pronoun)
 								            _exc[token] = [
-												Improvement of rules now title insentive and have same declaration format

											
										
										
											2017-04-27 08:23:56 +00:00
+								                {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
+								                {LEMMA: "t", ORTH: "-t"},
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								                {LEMMA: pronoun, ORTH: "-" + pronoun},
 								            ]
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								for verb, verb_lemma in [("est", "être")]:
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
+								    for orth in [verb, verb.title()]:
 								        token = "{}-ce".format(orth)
 								        _exc[token] = [
 								            {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								            {LEMMA: "ce", ORTH: "-ce"},
 								        ]
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
 								    for orth in [pre, pre.title()]:
 								        _exc["%sest-ce" % orth] = [
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
+								            {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								            {LEMMA: "être", ORTH: "est", TAG: "VERB"},
 								            {LEMMA: "ce", ORTH: "-ce"},
 								        ]
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
 								_infixes_exc = []
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								orig_elision = "'"
-												Merge branch 'master' into develop

											
										
										
											2018-12-18 12:48:10 +00:00
+								orig_hyphen = "-"
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
 								# loop through the elision and hyphen characters, and try to substitute the ones that weren't used in the original list
 								for infix in FR_BASE_EXCEPTIONS:
 								    variants_infix = {infix}
 								    for elision_char in [x for x in ELISION if x != orig_elision]:
-												Merge branch 'master' into develop

											
										
										
											2018-12-18 12:48:10 +00:00
+								        variants_infix.update(
 								            [word.replace(orig_elision, elision_char) for word in variants_infix]
 								        )
 								    for hyphen_char in [x for x in ["-", "‐"] if x != orig_hyphen]:
 								        variants_infix.update(
 								            [word.replace(orig_hyphen, hyphen_char) for word in variants_infix]
 								        )
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    variants_infix.update([upper_first_letter(word) for word in variants_infix])
 								    _infixes_exc.extend(variants_infix)
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
-												Fix typo

											
										
										
											2017-05-08 14:11:45 +00:00
+								for orth in _infixes_exc:
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
+								    _exc[orth] = [{ORTH: orth}]
 								_hyphen_prefix = [
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "a[ée]ro",
 								    "abat",
 								    "a[fg]ro",
 								    "after",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "aigues?",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "am[ée]ricano",
 								    "anglo",
 								    "anti",
 								    "apr[èe]s",
 								    "arabo",
 								    "arcs?",
 								    "archi",
 								    "arrières?",
-												Clean up of char classes, few tokenizer fixes and faster default French tokenizer (#3293)

* splitting up latin unicode interval

* removing hyphen as infix for French

* adding failing test for issue 1235

* test for issue #3002 which now works

* partial fix for issue #2070

* keep the hyphen as infix for French (as it was)

* restore french expressions with hyphen as infix (as it was)

* added succeeding unit test for Issue #2656

* Fix issue #2822 with custom Italian exception

* Fix issue #2926 by allowing numbers right before infix /

* splitting up latin unicode interval

* removing hyphen as infix for French

* adding failing test for issue 1235

* test for issue #3002 which now works

* partial fix for issue #2070

* keep the hyphen as infix for French (as it was)

* restore french expressions with hyphen as infix (as it was)

* added succeeding unit test for Issue #2656

* Fix issue #2822 with custom Italian exception

* Fix issue #2926 by allowing numbers right before infix /

* remove duplicate

* remove xfail for Issue #2179 fixed by Matt

* adjust documentation and remove reference to regex lib

											
										
										
											2019-02-20 21:10:13 +00:00
+								    "audio",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "avant",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "avion",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "auto",
 								    "banc",
 								    "bas(?:ses?)?",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "bateaux?",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "bec?",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "belles?",
 								    "beau",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "best",
 								    "bio?",
 								    "bien",
 								    "blanc",
 								    "bo[îi]te",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "bonn?e?s?",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "bois",
 								    "bou(?:c|rg)",
 								    "b[êe]ta",
 								    "cache",
 								    "cap(?:ello)?",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "casse",
 								    "castel",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "champ",
 								    "chapelle",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "ch[âa]teau(?:neuf)?",
 								    "chasse",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "cha(?:ud|t)e?s?",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "chauffe",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "chou",
 								    "chromo",
 								    "claire?s?",
 								    "co(?:de|ca)?",
 								    "compte",
 								    "contre",
 								    "cordon",
 								    "coupe?",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "courte?s?",
 								    "couvre",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "crash",
 								    "crise",
 								    "croche",
 								    "cross",
 								    "cyber",
 								    "côte",
 								    "demi",
 								    "di(?:sney)?",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "dix",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "d[ée]s?",
 								    "dys",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "ex?",
 								    "émirato",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "entre",
 								    "est",
 								    "ethno",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "ex",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "extra",
 								    "extrême",
 								    "[ée]co",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "faux",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "fil",
 								    "fort",
 								    "franco?s?",
 								    "gallo",
 								    "gardes?",
 								    "gastro",
 								    "grande?",
 								    "gratte",
 								    "gr[ée]co",
 								    "gros",
 								    "g[ée]o",
 								    "haute?s?",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "homm?es?",
 								    "hors",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "hyper",
 								    "indo",
 								    "infra",
 								    "inter",
 								    "intra",
 								    "islamo",
 								    "italo",
 								    "jean",
 								    "labio",
 								    "latino",
 								    "live",
 								    "lot",
 								    "louis",
 								    "m[ai]cro",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "mal",
 								    "médio",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "mesnil",
 								    "mi(?:ni)?",
 								    "mono",
 								    "mont?s?",
 								    "moyen",
 								    "multi",
 								    "m[ée]cano",
 								    "m[ée]dico",
 								    "m[ée]do",
 								    "m[ée]ta",
 								    "mots?",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "neuro",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "noix",
 								    "non",
 								    "nord",
 								    "notre",
 								    "n[ée]o",
 								    "ouest",
 								    "outre",
 								    "ouvre",
 								    "passe",
 								    "perce",
 								    "pharmaco",
 								    "ph[oy]to",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "pieds?",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "pique",
 								    "poissons?",
 								    "ponce",
 								    "pont",
 								    "po[rs]t",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "pousse",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "primo",
 								    "pro(?:cès|to)?",
 								    "pare",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "petite?s?",
 								    "plessis",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "porte",
 								    "pré",
 								    "prêchi",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "protège",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "pseudo",
 								    "pêle",
 								    "péri",
 								    "puy",
 								    "quasi",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "quatre",
 								    "radio",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "recourt",
 								    "rythmo",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "(?:re)?doubles?",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "r[ée]",
 								    "r[ée]tro",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "requin",
 								    "sans?",
 								    "sa?inte?s?",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "semi",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "serre",
 								    "sino",
 								    "socio",
 								    "sociale?s?",
 								    "soixante",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "sous",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "su[bdrs]",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "super",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "taille",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "tire",
 								    "thermo",
 								    "tiers",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "tourne",
 								    "toute?s?",
 								    "tra[iî]ne?",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "trans",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "trente",
 								    "trois",
 								    "trousse",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "tr(?:i|ou)",
 								    "t[ée]l[ée]",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "utéro",
 								    "vaso",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "vi[cd]e",
 								    "vid[ée]o",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "vie(?:ux|i?lles?|i?l)",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "vill(?:e|eneuve|ers|ette|iers|y)",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "vingt",
 								    "voitures?",
 								    "wagons?",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    "ultra",
 								    "à",
 								    "[ée]lectro",
 								    "[ée]qui",
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    "Fontaine",
 								    "La Chapelle",
 								    "Marie",
 								    "Le Mesnil",
 								    "Neuville",
 								    "Pierre",
 								    "Val",
 								    "Vaux",
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								]
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
+								_regular_exp = [
-												Replacing regex library with re to increase tokenization speed (#3218)

* replace unicode categories with raw list of code points

* simplifying ranges

* fixing variable length quotes

* removing redundant regular expression

* small cleanup of regexp notations

* quotes and alpha as ranges instead of alternations

* removed most regexp dependencies and features

* exponential backtracking - unit tests

* rewrote expression with pathological backtracking

* disabling double hyphen tests for now

* test additional variants of repeating punctuation

* remove regex and redundant backslashes from load_reddit script

* small typo fixes

* disable double punctuation test for russian

* clean up old comments

* format block code

* final cleanup

* naming consistency

* french strings as unicode for python 2 support

* french regular expression case insensitive

											
										
										
											2019-02-01 07:05:22 +00:00
+								    "^a[{hyphen}]sexualis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^arginine[{hyphen}]méthyl[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^binge[{hyphen}]watch[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^black[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^bouche[{hyphen}]por[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^burn[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^by[{hyphen}]pass[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^ch[{elision}]tiis[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
 								    "^chape[{hyphen}]chut[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^down[{hyphen}]load[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^[ée]tats[{hyphen}]uni[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^droits?[{hyphen}]de[{hyphen}]l'homm[{al}]+$".format(
 								        hyphen=HYPHENS, al=ALPHA_LOWER
-												Merge branch 'master' into develop

											
										
										
											2018-12-18 12:48:10 +00:00
+								    ),
-												Replacing regex library with re to increase tokenization speed (#3218)

* replace unicode categories with raw list of code points

* simplifying ranges

* fixing variable length quotes

* removing redundant regular expression

* small cleanup of regexp notations

* quotes and alpha as ranges instead of alternations

* removed most regexp dependencies and features

* exponential backtracking - unit tests

* rewrote expression with pathological backtracking

* disabling double hyphen tests for now

* test additional variants of repeating punctuation

* remove regex and redundant backslashes from load_reddit script

* small typo fixes

* disable double punctuation test for russian

* clean up old comments

* format block code

* final cleanup

* naming consistency

* french strings as unicode for python 2 support

* french regular expression case insensitive

											
										
										
											2019-02-01 07:05:22 +00:00
+								    "^fac[{hyphen}]simil[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^fleur[{hyphen}]bleuis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^flic[{hyphen}]flaqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^fox[{hyphen}]trott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^google[{hyphen}]is[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^hard[{hyphen}]discount[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^hip[{hyphen}]hop[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^jet[{hyphen}]set[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^knock[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^lèche[{hyphen}]bott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^litho[{hyphen}]typographi[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^lock[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^lombri[{hyphen}]compost[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^mac[{hyphen}]adamis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^marque[{hyphen}]pag[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^mouton[{hyphen}]noiris[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^new[{hyphen}]york[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^pair[{hyphen}]programm[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^people[{hyphen}]is[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^plan[{hyphen}]socialis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^premier[{hyphen}]ministr[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^prud[{elision}]hom[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
 								    "^réarc[{hyphen}]bout[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^refox[{hyphen}]trott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^remicro[{hyphen}]ond[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^repique[{hyphen}]niqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^repetit[{hyphen}]déjeun[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^rick[{hyphen}]roll[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^rond[{hyphen}]ponn[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^shift[{hyphen}]cliqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^soudo[{hyphen}]bras[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^stabilo[{hyphen}]boss[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^strip[{hyphen}]teas[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^terra[{hyphen}]form[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^teuf[{hyphen}]teuf[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^yo[{hyphen}]yo[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^zig[{hyphen}]zag[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
 								    "^z[{elision}]yeut[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								]
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								# catching cases like faux-vampire
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								_regular_exp += [
-												Improve Catalan tokenization accuracy (#3225)

* small hyphen clean up for French

* catalan infix similar to french

											
										
										
											2019-02-04 09:37:19 +00:00
+								    "^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								        prefix=p,
-												Improve Catalan tokenization accuracy (#3225)

* small hyphen clean up for French

* catalan infix similar to french

											
										
										
											2019-02-04 09:37:19 +00:00
+								        hyphen=HYPHENS,   # putting the - first in the [] range avoids having to use a backslash
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								        elision=ELISION,
-												Replacing regex library with re to increase tokenization speed (#3218)

* replace unicode categories with raw list of code points

* simplifying ranges

* fixing variable length quotes

* removing redundant regular expression

* small cleanup of regexp notations

* quotes and alpha as ranges instead of alternations

* removed most regexp dependencies and features

* exponential backtracking - unit tests

* rewrote expression with pathological backtracking

* disabling double hyphen tests for now

* test additional variants of repeating punctuation

* remove regex and redundant backslashes from load_reddit script

* small typo fixes

* disable double punctuation test for russian

* clean up old comments

* format block code

* final cleanup

* naming consistency

* french strings as unicode for python 2 support

* french regular expression case insensitive

											
										
										
											2019-02-01 07:05:22 +00:00
+								        al=ALPHA_LOWER,
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    )
 								    for p in _hyphen_prefix
 								]
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
 								# catching cases like entr'abat
-												Merge branch 'master' into develop

											
										
										
											2018-12-18 12:48:10 +00:00
+								_elision_prefix = ["r?é?entr", "grande?s?", "r"]
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								_regular_exp += [
-												Improve Catalan tokenization accuracy (#3225)

* small hyphen clean up for French

* catalan infix similar to french

											
										
										
											2019-02-04 09:37:19 +00:00
+								    "^{prefix}[{elision}][{al}][{hyphen}{al}{elision}]*$".format(
 								        prefix=p, elision=ELISION, hyphen=HYPHENS, al=ALPHA_LOWER
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    )
 								    for p in _elision_prefix
 								]
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
 								# catching cases like saut-de-ski, pet-en-l'air
-												Merge branch 'master' into develop

											
										
										
											2018-12-18 12:48:10 +00:00
+								_hyphen_combination = [
 								    "l[èe]s?",
 								    "la",
 								    "en",
 								    "des?",
 								    "d[eu]",
 								    "sur",
 								    "sous",
 								    "aux?",
 								    "à",
 								    "et",
 								    "près",
 								    "saint",
 								]
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								_regular_exp += [
-												Replacing regex library with re to increase tokenization speed (#3218)

* replace unicode categories with raw list of code points

* simplifying ranges

* fixing variable length quotes

* removing redundant regular expression

* small cleanup of regexp notations

* quotes and alpha as ranges instead of alternations

* removed most regexp dependencies and features

* exponential backtracking - unit tests

* rewrote expression with pathological backtracking

* disabling double hyphen tests for now

* test additional variants of repeating punctuation

* remove regex and redundant backslashes from load_reddit script

* small typo fixes

* disable double punctuation test for russian

* clean up old comments

* format block code

* final cleanup

* naming consistency

* french strings as unicode for python 2 support

* french regular expression case insensitive

											
										
										
											2019-02-01 07:05:22 +00:00
+								    "^[{a}]+[{hyphen}]{hyphen_combo}[{hyphen}](?:l[{elision}])?[{a}]+$".format(
 								        hyphen_combo=hc, elision=ELISION, hyphen=HYPHENS, a=ALPHA
-												French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023

											
										
										
											2018-12-16 17:04:55 +00:00
+								    )
 								    for hc in _hyphen_combination
 								]
 								# URLs
-												Rename _URL_PATTERN to URL_PATTERN

											
										
										
											2017-05-08 22:00:00 +00:00
+								_regular_exp.append(URL_PATTERN)
-												Reorganise French language data

											
										
										
											2017-05-08 13:49:05 +00:00
-												Don't copy exception dicts if not necessary and tidy up

											
										
										
											2017-10-31 20:05:29 +00:00
+								TOKENIZER_EXCEPTIONS = _exc
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								TOKEN_MATCH = re.compile(
 								    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE
 								).match