2023-06-14 15:48:41 +00:00
|
|
|
from ...symbols import NORM, ORTH
|
2020-07-22 20:18:46 +00:00
|
|
|
from ...util import update_exc
|
2023-06-14 15:48:41 +00:00
|
|
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
2017-05-08 13:48:31 +00:00
|
|
|
|
|
|
|
# Special-case table filled in below: exception string -> list of
# token-attribute dicts (keyed by ORTH and, for contractions, NORM).
_exc = {}
|
|
|
|
|
2017-01-31 22:27:29 +00:00
|
|
|
|
|
|
|
# Source: https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html
|
2017-05-08 13:48:31 +00:00
|
|
|
# Abbreviations and other fixed forms. Each entry is registered under its own
# text and maps to a one-element list, i.e. a single token whose ORTH is the
# entry itself (so trailing or internal periods/colons stay attached).
for exc_data in [
    {ORTH: "aik."},
    {ORTH: "alk."},
    {ORTH: "alv."},
    {ORTH: "ark."},
    {ORTH: "as."},
    {ORTH: "eaa."},
    {ORTH: "ed."},
    {ORTH: "esim."},
    {ORTH: "huom."},
    {ORTH: "jne."},
    {ORTH: "joht."},
    {ORTH: "k."},
    {ORTH: "ks."},
    {ORTH: "lk."},
    {ORTH: "lkm."},
    {ORTH: "lyh."},
    {ORTH: "läh."},
    {ORTH: "miel."},
    {ORTH: "milj."},
    {ORTH: "Mm."},
    {ORTH: "mm."},
    {ORTH: "myöh."},
    {ORTH: "n."},
    {ORTH: "nimim."},
    {ORTH: "n:o"},
    {ORTH: "N:o"},
    {ORTH: "nro"},
    {ORTH: "ns."},
    {ORTH: "nyk."},
    {ORTH: "oik."},
    {ORTH: "os."},
    {ORTH: "p."},
    {ORTH: "par."},
    {ORTH: "per."},
    {ORTH: "pj."},
    {ORTH: "puh.joht."},
    {ORTH: "prof."},
    {ORTH: "puh."},
    {ORTH: "pvm."},
    {ORTH: "rak."},
    {ORTH: "ry."},
    {ORTH: "s."},
    {ORTH: "siht."},
    {ORTH: "synt."},
    {ORTH: "t."},
    {ORTH: "tark."},
    {ORTH: "til."},
    {ORTH: "tms."},
    {ORTH: "toim."},
    {ORTH: "v."},
    {ORTH: "vas."},
    {ORTH: "vast."},
    {ORTH: "vrt."},
    {ORTH: "yht."},
    {ORTH: "yl."},
    {ORTH: "ym."},
    {ORTH: "yms."},
    {ORTH: "yo."},
    {ORTH: "yliopp."},
    {ORTH: "ao."},
    {ORTH: "em."},
    {ORTH: "ko."},
    {ORTH: "ml."},
    {ORTH: "po."},
    {ORTH: "so."},
    {ORTH: "ts."},
    {ORTH: "vm."},
    {ORTH: "srk."},
]:
    # Key the table by the surface form; the value is the token list for it.
    _exc[exc_data[ORTH]] = [exc_data]
|
2017-05-08 13:48:31 +00:00
|
|
|
|
2021-06-16 08:56:47 +00:00
|
|
|
# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141
|
|
|
|
# Conjunction parts of contracted "conjunction + negative verb" forms.
# Each pair is (surface stem as written in the contraction, NORM assigned to
# that stem when the contraction is split into two tokens).
conj_contraction_bases = [
    ("ett", "että"),
    ("jott", "jotta"),
    ("kosk", "koska"),
    ("mutt", "mutta"),
    ("vaikk", "vaikka"),
    ("ehk", "ehkä"),
    ("miks", "miksi"),
    ("siks", "siksi"),
    ("joll", "jos"),
    ("ell", "jos"),
]
|
|
|
|
# Negative-verb parts of the contracted forms.
# Each pair is (surface suffix, NORM for that suffix); the unaccented spelling
# "eivat" is accepted and normalized to "eivät".
conj_contraction_negations = [
    ("en", "en"),
    ("et", "et"),
    ("ei", "ei"),
    ("emme", "emme"),
    ("ette", "ette"),
    ("eivat", "eivät"),
    ("eivät", "eivät"),
]
|
2021-06-16 08:56:47 +00:00
|
|
|
# Register every stem + negation combination (e.g. "ett" + "ei") as a
# two-token special case, in both lowercase and title-cased form of the stem.
for base_lower, base_norm in conj_contraction_bases:
    for base in [base_lower, base_lower.title()]:
        for suffix, suffix_norm in conj_contraction_negations:
            # The contraction is keyed by its full surface form and split
            # into the conjunction stem and the negative verb, each carrying
            # its own normalized form.
            _exc[base + suffix] = [
                {ORTH: base, NORM: base_norm},
                {ORTH: suffix, NORM: suffix_norm},
            ]
|
2017-05-08 13:48:31 +00:00
|
|
|
|
2020-07-22 20:18:46 +00:00
|
|
|
# Public table exported by this module: the shared BASE_EXCEPTIONS combined
# with the entries collected in _exc. The merge/override rules live in
# update_exc — NOTE(review): confirm precedence there before relying on it.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|