added tag_map for indonesian (#3515)

* added tag_map for indonesian

* changed tag map from .py to .txt to see if tests pass

* added symbols import

* added utf8 encoding flag

* added missing SCONJ symbol

* Auto-format

* Remove unused imports

* Make tag map available in Indonesian defaults
This commit is contained in:
jeannefukumaru 2019-04-01 18:27:48 +08:00 committed by Ines Montani
parent c23e234d65
commit 6cdb7b2e04
2 changed files with 36 additions and 0 deletions

View File

@ -8,6 +8,7 @@ from .norm_exceptions import NORM_EXCEPTIONS
from .lemmatizer import LOOKUP from .lemmatizer import LOOKUP
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS from ..norm_exceptions import BASE_NORMS
@ -30,6 +31,7 @@ class IndonesianDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
lemma_lookup = LOOKUP lemma_lookup = LOOKUP
tag_map = TAG_MAP
class Indonesian(Language): class Indonesian(Language):

34
spacy/lang/id/tag_map.py Normal file
View File

@ -0,0 +1,34 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PRON, AUX, SCONJ
# Explanations of the Indonesian POS tags are available at https://www.aclweb.org/anthology/Y12-1014
# Maps each fine-grained tag string to an attribute dict holding its
# coarse-grained POS symbol. The table below is written as flat
# (tag, symbol) pairs; the comprehension wraps every symbol in its own
# {POS: symbol} dict, so no attribute dict is shared between tags.
TAG_MAP = {
    tag: {POS: pos_symbol}
    for tag, pos_symbol in (
        ("NSD", NOUN),
        ("Z", PUNCT),
        ("VSA", VERB),
        ("CC-", NUM),
        ("R", ADP),
        ("D", ADV),
        ("ASP", ADJ),
        ("S", SCONJ),
        ("VSP", VERB),
        ("H", CCONJ),
        ("F", X),
        ("B", DET),
        ("CO-", NUM),
        ("G", ADV),
        ("PS3", PRON),
        ("W", ADV),
        ("O", AUX),
        ("PP1", PRON),
        ("ASS", ADJ),
        ("PS1", PRON),
        ("APP", ADJ),
        ("CD-", NUM),
        ("VPA", VERB),
        ("VPP", VERB),
    )
}