spaCy/spacy/lang/ga/tokenizer_exceptions.py

from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX
from ...symbols import ORTH, LEMMA, NORM


# Contracted forms that are broken up into several sub-tokens. Each key is the
# surface string as written; each value holds one attribute dict per sub-token,
# and the ORTH values of an entry concatenate back to the key.
_exc = {
    "'acha'n": [
        {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET},
        {ORTH: "a'n", LEMMA: "aon", NORM: "aon", POS: DET},
    ],
    "dem'": [
        {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},
        {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET},
    ],
    "ded'": [
        {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},
        {ORTH: "d'", LEMMA: "do", NORM: "do", POS: DET},
    ],
    "lem'": [
        {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
        {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET},
    ],
    "led'": [
        {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
        # The lemma has to agree with NORM ("do"), exactly as in the "ded'"
        # entry; it previously read "mo", copied over from the "lem'" entry.
        {ORTH: "d'", LEMMA: "do", NORM: "do", POS: DET},
    ],
}

# Exceptions that stay a single token: every attribute dict below is stored in
# `_exc` under its own ORTH string, wrapped in a one-element list. The entries
# are apostrophe-marked short forms and period-terminated abbreviations whose
# LEMMA (and NORM, where given) spells out the full form.
for exc_data in (
    {ORTH: "'gus", LEMMA: "agus", NORM: "agus", POS: CCONJ},
    {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET},
    {ORTH: "ao'", LEMMA: "aon", NORM: "aon"},
    {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar", POS: ADV},
    {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos", POS: ADV},
    {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu", POS: ADV},
    {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht", POS: ADV},
    {ORTH: "m'", LEMMA: "mo", POS: DET},
    {ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN},
    {ORTH: "Ath.", LEMMA: "athair", POS: NOUN},
    {ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN},
    {ORTH: "a.C.n.", LEMMA: "ante Christum natum", POS: X},
    {ORTH: "m.sh.", LEMMA: "mar shampla", POS: ADV},
    {ORTH: "M.F.", LEMMA: "Meán Fómhair", POS: NOUN},
    {ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN},
    {ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN},
    {ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN},
    {ORTH: "r.C.", LEMMA: "roimh Chríost", POS: ADV},
    {ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV},
    {ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV},
    {ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV},
    {ORTH: "R.Ch.", LEMMA: "roimh Chríost", POS: ADV},
    {ORTH: "R.Chr.", LEMMA: "roimh Chríost", POS: ADV},
    {ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV},
    {ORTH: "srl.", LEMMA: "agus araile", POS: ADV},
    {ORTH: "Co.", LEMMA: "contae", POS: NOUN},
    {ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN},
    {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN},
    {ORTH: "gCo.", LEMMA: "contae", POS: NOUN},
    {ORTH: ".i.", LEMMA: "eadhon", POS: ADV},
    {ORTH: "B'", LEMMA: "ba", POS: AUX},
    {ORTH: "b'", LEMMA: "ba", POS: AUX},
    {ORTH: "lch.", LEMMA: "leathanach", POS: NOUN},
    {ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN},
    {ORTH: "lgh.", LEMMA: "leathanach", POS: NOUN},
    {ORTH: "Lgh.", LEMMA: "leathanach", POS: NOUN},
    {ORTH: "Lún.", LEMMA: "Lúnasa", POS: NOUN},
    {ORTH: "Már.", LEMMA: "Márta", POS: NOUN},
    {ORTH: "Meith.", LEMMA: "Meitheamh", POS: NOUN},
    {ORTH: "Noll.", LEMMA: "Nollaig", POS: NOUN},
    {ORTH: "Samh.", LEMMA: "Samhain", POS: NOUN},
    {ORTH: "tAth.", LEMMA: "athair", POS: NOUN},
    {ORTH: "tUas.", LEMMA: "Uasal", POS: NOUN},
    {ORTH: "teo.", LEMMA: "teoranta", POS: NOUN},
    {ORTH: "Teo.", LEMMA: "teoranta", POS: NOUN},
    {ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN},
    {ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN},
    {ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN},
):
    # Keyed by the token's own surface form; the dict object itself is stored
    # (not a copy) as the sole element of the value list.
    _exc.update({exc_data[ORTH]: [exc_data]})

# "d'" and "D'" are registered as-is: a single token carrying only its ORTH
# value, with no lemma, norm or part-of-speech attached.
for orth in ("d'", "D'"):
    _exc.update({orth: [{ORTH: orth}]})


# Public name for the finished table; it is the same dict object as `_exc`.
TOKENIZER_EXCEPTIONS = _exc
Add missing symbols 2017-10-31 18:34:45 +00:00			`from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX`
			`from ...symbols import ORTH, LEMMA, NORM`
attempt a port from #1147 2017-06-26 20:24:55 +00:00

			`_exc = {`
			`"'acha'n": [`
add POS 2017-06-26 20:53:41 +00:00			`{ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET},`
💫 Tidy up and auto-format .py files (#2983) <!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results. At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-11-30 16:03:03 +00:00			`{ORTH: "a'n", LEMMA: "aon", NORM: "aon", POS: DET},`
			`],`
attempt a port from #1147 2017-06-26 20:24:55 +00:00			`"dem'": [`
add POS 2017-06-26 20:53:41 +00:00			`{ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},`
💫 Tidy up and auto-format .py files (#2983) <!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results. At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-11-30 16:03:03 +00:00			`{ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET},`
			`],`
attempt a port from #1147 2017-06-26 20:24:55 +00:00			`"ded'": [`
add POS 2017-06-26 20:53:41 +00:00			`{ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},`
💫 Tidy up and auto-format .py files (#2983) <!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results. At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-11-30 16:03:03 +00:00			`{ORTH: "d'", LEMMA: "do", NORM: "do", POS: DET},`
			`],`
attempt a port from #1147 2017-06-26 20:24:55 +00:00			`"lem'": [`
add POS 2017-06-26 20:53:41 +00:00			`{ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},`
💫 Tidy up and auto-format .py files (#2983) <!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results. At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-11-30 16:03:03 +00:00			`{ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET},`
			`],`
attempt a port from #1147 2017-06-26 20:24:55 +00:00			`"led'": [`
add POS 2017-06-26 20:53:41 +00:00			`{ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},`
💫 Tidy up and auto-format .py files (#2983) <!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results. At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-11-30 16:03:03 +00:00			`{ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET},`
			`],`
attempt a port from #1147 2017-06-26 20:24:55 +00:00			`}`

			`for exc_data in [`
add POS 2017-06-26 20:53:41 +00:00			`{ORTH: "'gus", LEMMA: "agus", NORM: "agus", POS: CCONJ},`
			`{ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET},`
attempt a port from #1147 2017-06-26 20:24:55 +00:00			`{ORTH: "ao'", LEMMA: "aon", NORM: "aon"},`
add POS 2017-06-26 20:53:41 +00:00			`{ORTH: "'niar", LEMMA: "aniar", NORM: "aniar", POS: ADV},`
			`{ORTH: "'níos", LEMMA: "aníos", NORM: "aníos", POS: ADV},`
			`{ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu", POS: ADV},`
			`{ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht", POS: ADV},`
			`{ORTH: "m'", LEMMA: "mo", POS: DET},`
			`{ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN},`
			`{ORTH: "Ath.", LEMMA: "athair", POS: NOUN},`
			`{ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN},`
just now discovered that you can do multiwords 2017-06-26 21:19:39 +00:00			`{ORTH: "a.C.n.", LEMMA: "ante Christum natum", POS: X},`
			`{ORTH: "m.sh.", LEMMA: "mar shampla", POS: ADV},`
			`{ORTH: "M.F.", LEMMA: "Meán Fómhair", POS: NOUN},`
			`{ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN},`
			`{ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN},`
			`{ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN},`
missed a couple 2017-06-26 21:24:14 +00:00			`{ORTH: "r.C.", LEMMA: "roimh Chríost", POS: ADV},`
just now discovered that you can do multiwords 2017-06-26 21:19:39 +00:00			`{ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV},`
			`{ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV},`
			`{ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV},`
missed a couple 2017-06-26 21:24:14 +00:00			`{ORTH: "R.Ch.", LEMMA: "roimh Chríost", POS: ADV},`
			`{ORTH: "R.Chr.", LEMMA: "roimh Chríost", POS: ADV},`
just now discovered that you can do multiwords 2017-06-26 21:19:39 +00:00			`{ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV},`
			`{ORTH: "srl.", LEMMA: "agus araile", POS: ADV},`
add POS 2017-06-26 20:53:41 +00:00			`{ORTH: "Co.", LEMMA: "contae", POS: NOUN},`
			`{ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN},`
			`{ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN},`
			`{ORTH: "gCo.", LEMMA: "contae", POS: NOUN},`
			`{ORTH: ".i.", LEMMA: "eadhon", POS: ADV},`
b' 2017-06-27 21:42:16 +00:00			`{ORTH: "B'", LEMMA: "ba", POS: AUX},`
			`{ORTH: "b'", LEMMA: "ba", POS: AUX},`
add POS 2017-06-26 20:53:41 +00:00			`{ORTH: "lch.", LEMMA: "leathanach", POS: NOUN},`
			`{ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN},`
			`{ORTH: "lgh.", LEMMA: "leathanach", POS: NOUN},`
			`{ORTH: "Lgh.", LEMMA: "leathanach", POS: NOUN},`
			`{ORTH: "Lún.", LEMMA: "Lúnasa", POS: NOUN},`
			`{ORTH: "Már.", LEMMA: "Márta", POS: NOUN},`
			`{ORTH: "Meith.", LEMMA: "Meitheamh", POS: NOUN},`
			`{ORTH: "Noll.", LEMMA: "Nollaig", POS: NOUN},`
			`{ORTH: "Samh.", LEMMA: "Samhain", POS: NOUN},`
			`{ORTH: "tAth.", LEMMA: "athair", POS: NOUN},`
			`{ORTH: "tUas.", LEMMA: "Uasal", POS: NOUN},`
			`{ORTH: "teo.", LEMMA: "teoranta", POS: NOUN},`
			`{ORTH: "Teo.", LEMMA: "teoranta", POS: NOUN},`
			`{ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN},`
			`{ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN},`
💫 Tidy up and auto-format .py files (#2983) <!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results. At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-11-30 16:03:03 +00:00			`{ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN},`
			`]:`
Remove comma that caused list to wrap in tuple! Also removed extra dict wrappings for performance (we used to have them in there, but they should only really exist if copying the dict is absolutely necessary) 2017-10-31 19:13:16 +00:00			`_exc[exc_data[ORTH]] = [exc_data]`
attempt a port from #1147 2017-06-26 20:24:55 +00:00
💫 Tidy up and auto-format .py files (#2983) <!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results. At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-11-30 16:03:03 +00:00			`for orth in ["d'", "D'"]:`
attempt a port from #1147 2017-06-26 20:24:55 +00:00			`_exc[orth] = [{ORTH: orth}]`


Remove comma that caused list to wrap in tuple! Also removed extra dict wrappings for performance (we used to have them in there, but they should only really exist if copying the dict is absolutely necessary) 2017-10-31 19:13:16 +00:00			`TOKENIZER_EXCEPTIONS = _exc`