# spaCy/spacy/lang/el/tokenizer_exceptions.py

from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS

# Maps a surface string to the list of token dicts it should be split into.
_exc = {}

# Apostrophe-marked short forms, grouped by the NORM value they receive.
# Each surface form stays a single token (ORTH == the key itself).
for norm, variants in [
    ("από", ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]),
    ("αλλά", ["Αλλ'", "αλλ'"]),
    ("παρά", ["παρ'", "Παρ'", "ΠΑΡ'"]),
    ("κάθε", ["καθ'", "Καθ'"]),
    ("κατά", ["κατ'", "Κατ'"]),
    ("είμαι", ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]),
    ("επί", ["Επ'", "επ'", "εφ'", "Εφ'"]),
    ("δια", ["Δι'", "δι'"]),
    ("έχω", ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]),
    ("υπό", ["υπ'", "Υπ'"]),
    # NOTE(review): "'μετ" has a leading apostrophe unlike its siblings —
    # possibly meant to be "μετ'"; confirm before changing.
    ("μετά", ["Μετ'", "ΜΕΤ'", "'μετ"]),
    ("με", ["Μ'", "μ'"]),
    ("για", ["Γι'", "ΓΙ'", "γι'"]),
    ("σε", ["Σ'", "σ'"]),
    ("θα", ["Θ'", "θ'"]),
    ("να", ["Ν'", "ν'"]),
]:
    for token in variants:
        _exc[token] = [{ORTH: token, NORM: norm}]

# "Τ'" / "τ'" are normalised to "το". These keys used to be assigned twice
# (first with NORM "να", then with "το"); only the later "το" value ever took
# effect, so it is the single assignment kept. It sits before the "εγώ" group
# so the keys keep the same insertion position in `_exc`.
for token in ["Τ'", "τ'"]:
    _exc[token] = [{ORTH: token, NORM: "το"}]

for token in ["'γω", "'σένα", "'μεις"]:
    _exc[token] = [{ORTH: token, NORM: "εγώ"}]

# Further apostrophe-marked short forms, grouped by the NORM value assigned
# to them. Every surface form is kept as one token.
for norm, variants in [
    ("φέρνω", ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]),
    ("έρχομαι", ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]),
    ("λέγω", ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]),
    ("παίρνω", ["Πάρ'", "πάρ'"]),
    ("μέσα", ["μέσ'", "Μέσ'", "μεσ'"]),
    ("δένω", ["Δέσ'", "Δεσ'", "δεσ'"]),
    ("κάνω", ["'κανε", "Κάν'"]),
]:
    for token in variants:
        _exc[token] = [{ORTH: token, NORM: norm}]

# One-off (surface form, NORM) pairs that do not share a NORM with several
# variants. Each pair becomes a single-token exception keyed by its surface
# form, and the whole table is then merged into `_exc`.
_other_exc = {
    surface: [{ORTH: surface, NORM: normalized}]
    for surface, normalized in (
        ("κι", "και"),
        ("Παίξ'", "παίζω"),
        ("Αντ'", "αντί"),
        ("ολ'", "όλος"),
        ("ύστερ'", "ύστερα"),
        ("'πρεπε", "πρέπει"),
        ("Δύσκολ'", "δύσκολος"),
        ("'θελα", "θέλω"),
        ("'γραφα", "γράφω"),
        ("'παιρνα", "παίρνω"),
        ("'δειξε", "δείχνω"),
        ("όμουρφ'", "όμορφος"),
        ("κ'τσή", "κουτσός"),
        ("μηδ'", "μήδε"),
        ("'ξομολογήθηκε", "εξομολογούμαι"),
        ("'μας", "εμάς"),
        ("'ξερες", "ξέρω"),
        ("έφθασ'", "φθάνω"),
        ("εξ'", "εκ"),
        ("δώσ'", "δίνω"),
        ("τίποτ'", "τίποτα"),
        ("Λήξ'", "λήγω"),
        ("άσ'", "αφήνω"),
        ("Στ'", "στο"),
        ("Δωσ'", "δίνω"),
        ("Βάψ'", "βάφω"),
        ("Αλλ'", "αλλά"),
        ("Αμ'", "άμα"),
        ("Αγόρασ'", "αγοράζω"),
        ("'φύγε", "φεύγω"),
        ("'φερε", "φέρνω"),
        ("'φαγε", "τρώω"),
        ("'σπαγαν", "σπάω"),
        ("'σκασε", "σκάω"),
        ("'σβηνε", "σβήνω"),
        ("'ριξε", "ρίχνω"),
        ("'κλεβε", "κλέβω"),
        ("'κει", "εκεί"),
        ("'βλεπε", "βλέπω"),
        ("'βγαινε", "βγαίνω"),
    )
}

_exc.update(_other_exc)

# Hours 1-12 written directly against a period marker (e.g. "3πμ") are split
# into two tokens: the hour digits and the marker. The undotted markers get
# the dotted spelling as their NORM.
for h in range(1, 13):
    for period, period_norm in (
        ("π.μ.", "π.μ."),
        ("πμ", "π.μ."),
        ("μ.μ.", "μ.μ."),
        ("μμ", "μ.μ."),
    ):
        _exc[str(h) + period] = [
            {ORTH: str(h)},
            {ORTH: period, NORM: period_norm},
        ]

# Abbreviations that stay a single token and carry an expanded NORM value.
# NOTE(review): "Αγ. Γρ." and "Μ. Ασία" contain a space — confirm that the
# tokenizer can actually match keys with internal whitespace.
_exc.update(
    {
        abbreviation: [{ORTH: abbreviation, NORM: expansion}]
        for abbreviation, expansion in (
            ("ΑΓΡ.", "Αγροτικός"),
            ("Αγ. Γρ.", "Αγία Γραφή"),
            ("Αθ.", "Αθανάσιος"),
            ("Αλεξ.", "Αλέξανδρος"),
            ("Απρ.", "Απρίλιος"),
            ("Αύγ.", "Αύγουστος"),
            ("Δεκ.", "Δεκέμβριος"),
            ("Δημ.", "Δήμος"),
            ("Ιαν.", "Ιανουάριος"),
            ("Ιούλ.", "Ιούλιος"),
            ("Ιούν.", "Ιούνιος"),
            ("Ιωαν.", "Ιωάννης"),
            ("Μ. Ασία", "Μικρά Ασία"),
            ("Μάρτ.", "Μάρτιος"),
            ("Μάρτ'", "Μάρτιος"),
            ("Νοέμβρ.", "Νοέμβριος"),
            ("Οκτ.", "Οκτώβριος"),
            ("Σεπτ.", "Σεπτέμβριος"),
            ("Φεβρ.", "Φεβρουάριος"),
        )
    }
)

# Strings that must be kept as one token exactly as written; no NORM is set.
# The list previously contained "Κ.Ι.Θ." twice; the repeated literal only
# re-assigned an identical value, so a single occurrence is kept.
for orth in [
    "$ΗΠΑ",
    "Α'",
    "Α.Ε.",
    "Α.Ε.Β.Ε.",
    "Α.Ε.Ι.",
    "Α.Ε.Π.",
    "Α.Μ.Α.",
    "Α.Π.Θ.",
    "Α.Τ.",
    "Α.Χ.",
    "ΑΝ.",
    "Αγ.",
    "Αλ.",
    "Αν.",
    "Αντ.",
    "Απ.",
    "Β'",
    "Β)",
    "Β.Ζ.",
    "Β.Ι.Ο.",
    "Β.Κ.",
    "Β.Μ.Α.",
    "Βασ.",
    "Γ'",
    "Γ)",
    "Γ.Γ.",
    "Γ.Δ.",
    "Γκ.",
    "Δ.Ε.Η.",
    "Δ.Ε.Σ.Ε.",
    "Δ.Ν.",
    "Δ.Ο.Υ.",
    "Δ.Σ.",
    "Δ.Υ.",
    "ΔΙ.ΚΑ.Τ.Σ.Α.",
    "Δηλ.",
    "Διον.",
    "Ε.Α.",
    "Ε.Α.Κ.",
    "Ε.Α.Π.",
    "Ε.Ε.",
    "Ε.Κ.",
    "Ε.ΚΕ.ΠΙΣ.",
    "Ε.Λ.Α.",
    "Ε.Λ.Ι.Α.",
    "Ε.Π.Σ.",
    "Ε.Π.Τ.Α.",
    "Ε.Σ.Ε.Ε.Κ.",
    "Ε.Υ.Κ.",
    "ΕΕ.",
    "ΕΚ.",
    "ΕΛ.",
    "ΕΛ.ΑΣ.",
    "Εθν.",
    "Ελ.",
    "Εμ.",
    "Επ.",
    "Ευ.",
    "Η'",
    "Η.Π.Α.",
    "ΘΕ.",
    "Θεμ.",
    "Θεοδ.",
    "Θρ.",
    "Ι.Ε.Κ.",
    "Ι.Κ.Α.",
    "Ι.Κ.Υ.",
    "Ι.Σ.Θ.",
    "Ι.Χ.",
    "ΙΖ'",
    "ΙΧ.",
    "Κ.Α.Α.",
    "Κ.Α.Ε.",
    "Κ.Β.Σ.",
    "Κ.Δ.",
    "Κ.Ε.",
    "Κ.Ε.Κ.",
    "Κ.Ι.",
    "Κ.Κ.",
    "Κ.Ι.Θ.",
    "Κ.ΚΕΚ.",
    "Κ.Ο.",
    "Κ.Π.Ρ.",
    "ΚΑΤ.",
    "ΚΚ.",
    "Καν.",
    "Καρ.",
    "Κατ.",
    "Κυρ.",
    "Κων.",
    "Λ.Α.",
    "Λ.χ.",
    "Λ.Χ.",
    "Λεωφ.",
    "Λι.",
    "Μ.Δ.Ε.",
    "Μ.Ε.Ο.",
    "Μ.Ζ.",
    "Μ.Μ.Ε.",
    "Μ.Ο.",
    "Μεγ.",
    "Μιλτ.",
    "Μιχ.",
    "Ν.Δ.",
    "Ν.Ε.Α.",
    "Ν.Κ.",
    "Ν.Ο.",
    "Ν.Ο.Θ.",
    "Ν.Π.Δ.Δ.",
    "Ν.Υ.",
    "ΝΔ.",
    "Νικ.",
    "Ντ'",
    "Ντ.",
    "Ο'",
    "Ο.Α.",
    "Ο.Α.Ε.Δ.",
    "Ο.Δ.",
    "Ο.Ε.Ε.",
    "Ο.Ε.Ε.Κ.",
    "Ο.Η.Ε.",
    "Ο.Κ.",
    "Π.Δ.",
    "Π.Ε.Κ.Δ.Υ.",
    "Π.Ε.Π.",
    "Π.Μ.Σ.",
    "ΠΟΛ.",
    "Π.Χ.",
    "Παρ.",
    "Πλ.",
    "Πρ.",
    "Σ.Δ.Ο.Ε.",
    "Σ.Ε.",
    "Σ.Ε.Κ.",
    "Σ.Π.Δ.Ω.Β.",
    "Σ.Τ.",
    "Σαβ.",
    "Στ.",
    "ΣτΕ.",
    "Στρ.",
    "Τ.Α.",
    "Τ.Ε.Ε.",
    "Τ.Ε.Ι.",
    "ΤΡ.",
    "Τζ.",
    "Τηλ.",
    "Υ.Γ.",
    "ΥΓ.",
    "ΥΠ.Ε.Π.Θ.",
    "Φ.Α.Β.Ε.",
    "Φ.Κ.",
    "Φ.Σ.",
    "Φ.Χ.",
    "Φ.Π.Α.",
    "Φιλ.",
    "Χ.Α.Α.",
    "ΧΡ.",
    "Χ.Χ.",
    "Χαρ.",
    "Χιλ.",
    "Χρ.",
    "άγ.",
    "άρθρ.",
    "αι.",
    "αν.",
    "απ.",
    "αρ.",
    "αριθ.",
    "αριθμ.",
    "β'",
    "βλ.",
    "γ.γ.",
    "γεν.",
    "γραμμ.",
    "δ.δ.",
    "δ.σ.",
    "δηλ.",
    "δισ.",
    "δολ.",
    "δρχ.",
    "εκ.",
    "εκατ.",
    "ελ.",
    "θιν'",
    "κ.",
    "κ.ά.",
    "κ.α.",
    "κ.κ.",
    "κ.λπ.",
    "κ.ο.κ.",
    "κ.τ.λ.",
    "κλπ.",
    "κτλ.",
    "κυβ.",
    "λ.χ.",
    "μ.",
    "μ.Χ.",
    "μ.μ.",
    "μιλ.",
    "ντ'",
    "π.Χ.",
    "π.β.",
    "π.δ.",
    "π.μ.",
    "π.χ.",
    "σ.",
    "σ.α.λ.",
    "σ.σ.",
    "σελ.",
    "στρ.",
    "τ'ς",
    "τ.μ.",
    "τετ.",
    "τετρ.",
    "τηλ.",
    "τρισ.",
    "τόν.",
    "υπ.",
    "χ.μ.",
    "χγρ.",
    "χιλ.",
    "χλμ.",
]:
    _exc[orth] = [{ORTH: orth}]

# Public table for this module: the result of passing the shared
# BASE_EXCEPTIONS and the entries collected in `_exc` above to `update_exc`.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
-												Configure isort to use the Black profile, recursively isort the `spacy` module (#12721)

* Use isort with Black profile

* isort all the things

* Fix import cycles as a result of import sorting

* Add DOCBIN_ALL_ATTRS type definition

* Add isort to requirements

* Remove isort from build dependencies check

* Typo
											
										
										
											2023-06-14 15:48:41 +00:00
+								from ...symbols import NORM, ORTH
-												Tidy up and move noun_chunks, token_match, url_match

											
										
										
											2020-07-22 20:18:46 +00:00
+								from ...util import update_exc
-												Configure isort to use the Black profile, recursively isort the `spacy` module (#12721)

* Use isort with Black profile

* isort all the things

* Fix import cycles as a result of import sorting

* Add DOCBIN_ALL_ATTRS type definition

* Add isort to requirements

* Remove isort from build dependencies check

* Typo
											
										
										
											2023-06-14 15:48:41 +00:00
+								from ..tokenizer_exceptions import BASE_EXCEPTIONS
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
+								_exc = {}
 								for token in ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "από"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Αλλ'", "αλλ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "αλλά"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["παρ'", "Παρ'", "ΠΑΡ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "παρά"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["καθ'", "Καθ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "κάθε"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["κατ'", "Κατ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "κατά"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "είμαι"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Επ'", "επ'", "εφ'", "Εφ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "επί"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Δι'", "δι'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "δια"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "έχω"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["υπ'", "Υπ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "υπό"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Μετ'", "ΜΕΤ'", "'μετ"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "μετά"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Μ'", "μ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "με"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Γι'", "ΓΙ'", "γι'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "για"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Σ'", "σ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "σε"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Θ'", "θ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "θα"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Ν'", "ν'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "να"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Τ'", "τ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "να"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["'γω", "'σένα", "'μεις"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "εγώ"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Τ'", "τ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "το"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "φέρνω"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "έρχομαι"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "λέγω"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Πάρ'", "πάρ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "παίρνω"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["μέσ'", "Μέσ'", "μεσ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "μέσα"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["Δέσ'", "Δεσ'", "δεσ'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "δένω"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for token in ["'κανε", "Κάν'"]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    _exc[token] = [{ORTH: token, NORM: "κάνω"}]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								_other_exc = {
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    "κι": [{ORTH: "κι", NORM: "και"}],
 								    "Παίξ'": [{ORTH: "Παίξ'", NORM: "παίζω"}],
 								    "Αντ'": [{ORTH: "Αντ'", NORM: "αντί"}],
 								    "ολ'": [{ORTH: "ολ'", NORM: "όλος"}],
 								    "ύστερ'": [{ORTH: "ύστερ'", NORM: "ύστερα"}],
 								    "'πρεπε": [{ORTH: "'πρεπε", NORM: "πρέπει"}],
 								    "Δύσκολ'": [{ORTH: "Δύσκολ'", NORM: "δύσκολος"}],
 								    "'θελα": [{ORTH: "'θελα", NORM: "θέλω"}],
 								    "'γραφα": [{ORTH: "'γραφα", NORM: "γράφω"}],
 								    "'παιρνα": [{ORTH: "'παιρνα", NORM: "παίρνω"}],
 								    "'δειξε": [{ORTH: "'δειξε", NORM: "δείχνω"}],
 								    "όμουρφ'": [{ORTH: "όμουρφ'", NORM: "όμορφος"}],
 								    "κ'τσή": [{ORTH: "κ'τσή", NORM: "κουτσός"}],
 								    "μηδ'": [{ORTH: "μηδ'", NORM: "μήδε"}],
 								    "'ξομολογήθηκε": [{ORTH: "'ξομολογήθηκε", NORM: "εξομολογούμαι"}],
 								    "'μας": [{ORTH: "'μας", NORM: "εμάς"}],
 								    "'ξερες": [{ORTH: "'ξερες", NORM: "ξέρω"}],
 								    "έφθασ'": [{ORTH: "έφθασ'", NORM: "φθάνω"}],
 								    "εξ'": [{ORTH: "εξ'", NORM: "εκ"}],
 								    "δώσ'": [{ORTH: "δώσ'", NORM: "δίνω"}],
 								    "τίποτ'": [{ORTH: "τίποτ'", NORM: "τίποτα"}],
 								    "Λήξ'": [{ORTH: "Λήξ'", NORM: "λήγω"}],
 								    "άσ'": [{ORTH: "άσ'", NORM: "αφήνω"}],
 								    "Στ'": [{ORTH: "Στ'", NORM: "στο"}],
 								    "Δωσ'": [{ORTH: "Δωσ'", NORM: "δίνω"}],
 								    "Βάψ'": [{ORTH: "Βάψ'", NORM: "βάφω"}],
 								    "Αλλ'": [{ORTH: "Αλλ'", NORM: "αλλά"}],
 								    "Αμ'": [{ORTH: "Αμ'", NORM: "άμα"}],
 								    "Αγόρασ'": [{ORTH: "Αγόρασ'", NORM: "αγοράζω"}],
 								    "'φύγε": [{ORTH: "'φύγε", NORM: "φεύγω"}],
 								    "'φερε": [{ORTH: "'φερε", NORM: "φέρνω"}],
 								    "'φαγε": [{ORTH: "'φαγε", NORM: "τρώω"}],
 								    "'σπαγαν": [{ORTH: "'σπαγαν", NORM: "σπάω"}],
 								    "'σκασε": [{ORTH: "'σκασε", NORM: "σκάω"}],
 								    "'σβηνε": [{ORTH: "'σβηνε", NORM: "σβήνω"}],
 								    "'ριξε": [{ORTH: "'ριξε", NORM: "ρίχνω"}],
 								    "'κλεβε": [{ORTH: "'κλεβε", NORM: "κλέβω"}],
 								    "'κει": [{ORTH: "'κει", NORM: "εκεί"}],
 								    "'βλεπε": [{ORTH: "'βλεπε", NORM: "βλέπω"}],
 								    "'βγαινε": [{ORTH: "'βγαινε", NORM: "βγαίνω"}],
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
+								}
 								_exc.update(_other_exc)
 								for h in range(1, 12 + 1):
 								    for period in ["π.μ.", "πμ"]:
-												More formatting changes

											
										
										
											2019-12-25 16:59:52 +00:00
+								        _exc[f"{h}{period}"] = [
 								            {ORTH: f"{h}"},
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								            {ORTH: period, NORM: "π.μ."},
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								        ]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								    for period in ["μ.μ.", "μμ"]:
-												More formatting changes

											
										
										
											2019-12-25 16:59:52 +00:00
+								        _exc[f"{h}{period}"] = [
 								            {ORTH: f"{h}"},
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								            {ORTH: period, NORM: "μ.μ."},
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								        ]
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
 								for exc_data in [
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-22 21:09:01 +00:00
+								    {ORTH: "ΑΓΡ.", NORM: "Αγροτικός"},
 								    {ORTH: "Αγ. Γρ.", NORM: "Αγία Γραφή"},
 								    {ORTH: "Αθ.", NORM: "Αθανάσιος"},
 								    {ORTH: "Αλεξ.", NORM: "Αλέξανδρος"},
 								    {ORTH: "Απρ.", NORM: "Απρίλιος"},
 								    {ORTH: "Αύγ.", NORM: "Αύγουστος"},
 								    {ORTH: "Δεκ.", NORM: "Δεκέμβριος"},
 								    {ORTH: "Δημ.", NORM: "Δήμος"},
 								    {ORTH: "Ιαν.", NORM: "Ιανουάριος"},
 								    {ORTH: "Ιούλ.", NORM: "Ιούλιος"},
 								    {ORTH: "Ιούν.", NORM: "Ιούνιος"},
 								    {ORTH: "Ιωαν.", NORM: "Ιωάννης"},
 								    {ORTH: "Μ. Ασία", NORM: "Μικρά Ασία"},
 								    {ORTH: "Μάρτ.", NORM: "Μάρτιος"},
 								    {ORTH: "Μάρτ'", NORM: "Μάρτιος"},
 								    {ORTH: "Νοέμβρ.", NORM: "Νοέμβριος"},
 								    {ORTH: "Οκτ.", NORM: "Οκτώβριος"},
 								    {ORTH: "Σεπτ.", NORM: "Σεπτέμβριος"},
 								    {ORTH: "Φεβρ.", NORM: "Φεβρουάριος"},
-												Add support for Greek language (#2535)

* Add contributor agreement

* Support for Greek language

* Fix missing el_tokenizer

											
										
										
											2018-07-10 11:48:38 +00:00
+								]:
 								    _exc[exc_data[ORTH]] = [exc_data]
 								# Abbreviations, acronyms and elided forms that are registered without a
 								# NORM: each string maps to a one-element analysis whose ORTH is the string
 								# itself. Order is irrelevant because every entry is an independent key in
 								# `_exc`; listing a string twice would only repeat the same assignment.
 								for orth in [
 								    "$ΗΠΑ",
 								    "Α'",
 								    "Α.Ε.",
 								    "Α.Ε.Β.Ε.",
 								    "Α.Ε.Ι.",
 								    "Α.Ε.Π.",
 								    "Α.Μ.Α.",
 								    "Α.Π.Θ.",
 								    "Α.Τ.",
 								    "Α.Χ.",
 								    "ΑΝ.",
 								    "Αγ.",
 								    "Αλ.",
 								    "Αν.",
 								    "Αντ.",
 								    "Απ.",
 								    "Β'",
 								    "Β)",
 								    "Β.Ζ.",
 								    "Β.Ι.Ο.",
 								    "Β.Κ.",
 								    "Β.Μ.Α.",
 								    "Βασ.",
 								    "Γ'",
 								    "Γ)",
 								    "Γ.Γ.",
 								    "Γ.Δ.",
 								    "Γκ.",
 								    "Δ.Ε.Η.",
 								    "Δ.Ε.Σ.Ε.",
 								    "Δ.Ν.",
 								    "Δ.Ο.Υ.",
 								    "Δ.Σ.",
 								    "Δ.Υ.",
 								    "ΔΙ.ΚΑ.Τ.Σ.Α.",
 								    "Δηλ.",
 								    "Διον.",
 								    "Ε.Α.",
 								    "Ε.Α.Κ.",
 								    "Ε.Α.Π.",
 								    "Ε.Ε.",
 								    "Ε.Κ.",
 								    "Ε.ΚΕ.ΠΙΣ.",
 								    "Ε.Λ.Α.",
 								    "Ε.Λ.Ι.Α.",
 								    "Ε.Π.Σ.",
 								    "Ε.Π.Τ.Α.",
 								    "Ε.Σ.Ε.Ε.Κ.",
 								    "Ε.Υ.Κ.",
 								    "ΕΕ.",
 								    "ΕΚ.",
 								    "ΕΛ.",
 								    "ΕΛ.ΑΣ.",
 								    "Εθν.",
 								    "Ελ.",
 								    "Εμ.",
 								    "Επ.",
 								    "Ευ.",
 								    "Η'",
 								    "Η.Π.Α.",
 								    "ΘΕ.",
 								    "Θεμ.",
 								    "Θεοδ.",
 								    "Θρ.",
 								    "Ι.Ε.Κ.",
 								    "Ι.Κ.Α.",
 								    "Ι.Κ.Υ.",
 								    "Ι.Σ.Θ.",
 								    "Ι.Χ.",
 								    "ΙΖ'",
 								    "ΙΧ.",
 								    "Κ.Α.Α.",
 								    "Κ.Α.Ε.",
 								    "Κ.Β.Σ.",
 								    "Κ.Δ.",
 								    "Κ.Ε.",
 								    "Κ.Ε.Κ.",
 								    "Κ.Ι.",
 								    "Κ.Κ.",
 								    "Κ.Ι.Θ.",
 								    "Κ.ΚΕΚ.",
 								    "Κ.Ο.",
 								    "Κ.Π.Ρ.",
 								    "ΚΑΤ.",
 								    "ΚΚ.",
 								    "Καν.",
 								    "Καρ.",
 								    "Κατ.",
 								    "Κυρ.",
 								    "Κων.",
 								    "Λ.Α.",
 								    "Λ.χ.",
 								    "Λ.Χ.",
 								    "Λεωφ.",
 								    "Λι.",
 								    "Μ.Δ.Ε.",
 								    "Μ.Ε.Ο.",
 								    "Μ.Ζ.",
 								    "Μ.Μ.Ε.",
 								    "Μ.Ο.",
 								    "Μεγ.",
 								    "Μιλτ.",
 								    "Μιχ.",
 								    "Ν.Δ.",
 								    "Ν.Ε.Α.",
 								    "Ν.Κ.",
 								    "Ν.Ο.",
 								    "Ν.Ο.Θ.",
 								    "Ν.Π.Δ.Δ.",
 								    "Ν.Υ.",
 								    "ΝΔ.",
 								    "Νικ.",
 								    "Ντ'",
 								    "Ντ.",
 								    "Ο'",
 								    "Ο.Α.",
 								    "Ο.Α.Ε.Δ.",
 								    "Ο.Δ.",
 								    "Ο.Ε.Ε.",
 								    "Ο.Ε.Ε.Κ.",
 								    "Ο.Η.Ε.",
 								    "Ο.Κ.",
 								    "Π.Δ.",
 								    "Π.Ε.Κ.Δ.Υ.",
 								    "Π.Ε.Π.",
 								    "Π.Μ.Σ.",
 								    "ΠΟΛ.",
 								    "Π.Χ.",
 								    "Παρ.",
 								    "Πλ.",
 								    "Πρ.",
 								    "Σ.Δ.Ο.Ε.",
 								    "Σ.Ε.",
 								    "Σ.Ε.Κ.",
 								    "Σ.Π.Δ.Ω.Β.",
 								    "Σ.Τ.",
 								    "Σαβ.",
 								    "Στ.",
 								    "ΣτΕ.",
 								    "Στρ.",
 								    "Τ.Α.",
 								    "Τ.Ε.Ε.",
 								    "Τ.Ε.Ι.",
 								    "ΤΡ.",
 								    "Τζ.",
 								    "Τηλ.",
 								    "Υ.Γ.",
 								    "ΥΓ.",
 								    "ΥΠ.Ε.Π.Θ.",
 								    "Φ.Α.Β.Ε.",
 								    "Φ.Κ.",
 								    "Φ.Σ.",
 								    "Φ.Χ.",
 								    "Φ.Π.Α.",
 								    "Φιλ.",
 								    "Χ.Α.Α.",
 								    "ΧΡ.",
 								    "Χ.Χ.",
 								    "Χαρ.",
 								    "Χιλ.",
 								    "Χρ.",
 								    "άγ.",
 								    "άρθρ.",
 								    "αι.",
 								    "αν.",
 								    "απ.",
 								    "αρ.",
 								    "αριθ.",
 								    "αριθμ.",
 								    "β'",
 								    "βλ.",
 								    "γ.γ.",
 								    "γεν.",
 								    "γραμμ.",
 								    "δ.δ.",
 								    "δ.σ.",
 								    "δηλ.",
 								    "δισ.",
 								    "δολ.",
 								    "δρχ.",
 								    "εκ.",
 								    "εκατ.",
 								    "ελ.",
 								    "θιν'",
 								    "κ.",
 								    "κ.ά.",
 								    "κ.α.",
 								    "κ.κ.",
 								    "κ.λπ.",
 								    "κ.ο.κ.",
 								    "κ.τ.λ.",
 								    "κλπ.",
 								    "κτλ.",
 								    "κυβ.",
 								    "λ.χ.",
 								    "μ.",
 								    "μ.Χ.",
 								    "μ.μ.",
 								    "μιλ.",
 								    "ντ'",
 								    "π.Χ.",
 								    "π.β.",
 								    "π.δ.",
 								    "π.μ.",
 								    "π.χ.",
 								    "σ.",
 								    "σ.α.λ.",
 								    "σ.σ.",
 								    "σελ.",
 								    "στρ.",
 								    "τ'ς",
 								    "τ.μ.",
 								    "τετ.",
 								    "τετρ.",
 								    "τηλ.",
 								    "τρισ.",
 								    "τόν.",
 								    "υπ.",
 								    "χ.μ.",
 								    "χγρ.",
 								    "χιλ.",
 								    "χλμ.",
 								]:
 								    _exc[orth] = [{ORTH: orth}]
-												Tidy up and move noun_chunks, token_match, url_match

											
										
										
											2020-07-22 20:18:46 +00:00
+								TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)