diff --git a/spacy/bn/__init__.py b/spacy/bn/__init__.py
index b1335a110..d47b32857 100644
--- a/spacy/bn/__init__.py
+++ b/spacy/bn/__init__.py
@@ -1,10 +1,16 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+from .lemmatizer import LEMMA_RULES
+
+from ..language_data import BASE_EXCEPTIONS
 from ..language import Language
 from ..attrs import LANG
-
-from .language_data import *
+from ..util import update_exc
 
 
 class Bengali(Language):
@@ -14,7 +20,7 @@ class Bengali(Language):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'bn'
 
-        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
         tag_map = TAG_MAP
         stop_words = STOP_WORDS
         lemma_rules = LEMMA_RULES
@@ -23,4 +29,5 @@ class Bengali(Language):
         suffixes = tuple(TOKENIZER_SUFFIXES)
         infixes = tuple(TOKENIZER_INFIXES)
 
-EXPORT = Bengali
\ No newline at end of file
+
+__all__ = ['Bengali']
diff --git a/spacy/bn/lemma_rules.py b/spacy/bn/lemmatizer.py
similarity index 100%
rename from spacy/bn/lemma_rules.py
rename to spacy/bn/lemmatizer.py
diff --git a/spacy/bn/morph_rules.py b/spacy/bn/morph_rules.py
index efa5a6185..dda948e47 100644
--- a/spacy/bn/morph_rules.py
+++ b/spacy/bn/morph_rules.py
@@ -1,8 +1,9 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..language_data import PRON_LEMMA
-from ..symbols import *
+from ..symbols import LEMMA
+from ..deprecated import PRON_LEMMA
+
 
 MORPH_RULES = {
     "PRP": {
@@ -51,5 +52,5 @@ MORPH_RULES = {
               'Case': 'Nom'},
     'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs',
               'Poss': 'Yes', 'Case': 'Nom'},
-    },
+    }
 }
diff --git a/spacy/bn/punctuation.py b/spacy/bn/punctuation.py
index 3dd3e3a62..cd5ac7f1d 100644
--- a/spacy/bn/punctuation.py
+++ b/spacy/bn/punctuation.py
@@ -1,8 +1,10 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPHA_UPPER, LIST_QUOTES, UNITS, \
-    CURRENCY, LIST_PUNCT, ALPHA, _QUOTES
+from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES
+from ..language_data.punctuation import ALPHA_UPPER, LIST_QUOTES, UNITS
+from ..language_data.punctuation import CURRENCY, LIST_PUNCT, ALPHA, _QUOTES
+
 
 CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿ ৳"
@@ -42,4 +44,3 @@
     r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", "")),
     ]
 )
-__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
diff --git a/spacy/bn/tag_map.py b/spacy/bn/tag_map.py
index 2efae3a6a..a264e8d95 100644
--- a/spacy/bn/tag_map.py
+++ b/spacy/bn/tag_map.py
@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..symbols import *
+from ..symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
+from ..symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
 
 
 TAG_MAP = {
@@ -55,4 +56,22 @@ TAG_MAP = {
     "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
     "WRB": {POS: ADV, "PronType": "int|rel"},
     "SP": {POS: SPACE},
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "CCONJ": {POS: CCONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB},
+    "PART": {POS: PART},
 }