2017-10-14 12:59:52 +00:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
|
2019-09-27 09:57:27 +00:00
|
|
|
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
# Set of Hindi stop words. The literal below holds one word per line;
# str.split() with no arguments splits on any run of whitespace, so the
# surrounding newlines never produce empty entries. Only word entries belong
# inside the literal: any stray text placed there (annotations, timestamps,
# separators) would silently become a stop word.
# Some entries are deliberate spelling variants of the same word, and a few
# appear more than once; repeats are harmless because the result is a set.
STOP_WORDS = set(
    """
अंदर
अत
अदि
अप
अपना
अपनि
अपनी
अपने
अभि
अभी
अंदर
आदि
आप
अगर
इंहिं
इंहें
इंहों
इतयादि
इत्यादि
इन
इनका
इन्हीं
इन्हें
इन्हों
इस
इसका
इसकि
इसकी
इसके
इसमें
इसि
इसी
इसे
उंहिं
उंहें
उंहों
उन
उनका
उनकि
उनकी
उनके
उनको
उन्हीं
उन्हें
उन्हों
उस
उसके
उसि
उसी
उसे
एक
एवं
एस
एसे
ऐसे
ओर
और
कइ
कई
कर
करता
करते
करना
करने
करें
कहते
कहा
का
काफि
काफ़ी
कि
किंहें
किंहों
कितना
किन्हें
किन्हों
किया
किर
किस
किसि
किसी
किसे
की
कुछ
कुल
के
को
कोइ
कोई
कोन
कोनसा
कौन
कौनसा
गया
घर
जब
जहाँ
जहां
जा
जिंहें
जिंहों
जितना
जिधर
जिन
जिन्हें
जिन्हों
जिस
जिसे
जीधर
जेसा
जेसे
जैसा
जैसे
जो
तक
तब
तरह
तिंहें
तिंहों
तिन
तिन्हें
तिन्हों
तिस
तिसे
तो
था
थि
थी
थे
दबारा
दवारा
दिया
दुसरा
दुसरे
दूसरे
दो
द्वारा
न
नहिं
नहीं
ना
निचे
निहायत
नीचे
ने
पर
पहले
पुरा
पूरा
पे
फिर
बनि
बनी
बहि
बही
बहुत
बाद
बाला
बिलकुल
भि
भितर
भी
भीतर
मगर
मानो
मे
में
मैं
मुझको
मेरा
यदि
यह
यहाँ
यहां
यहि
यही
या
यिह
ये
रखें
रवासा
रहा
रहे
ऱ्वासा
लिए
लिये
लेकिन
व
वगेरह
वग़ैरह
वरग
वर्ग
वह
वहाँ
वहां
वहिं
वहीं
वाले
वुह
वे
वग़ैरह
संग
सकता
सकते
सबसे
सभि
सभी
साथ
साबुत
साभ
सारा
से
सो
संग
हि
ही
हुअ
हुआ
हुइ
हुई
हुए
हे
हें
है
हैं
हो
हूँ
होता
होति
होती
होते
होना
होने
""".split()
)
|