From 6cdb7b2e0416da26d87b0aa57db700817e1ad763 Mon Sep 17 00:00:00 2001
From: jeannefukumaru
Date: Mon, 1 Apr 2019 18:27:48 +0800
Subject: [PATCH] added tag_map for indonesian (#3515)

* added tag_map for indonesian

* changed tag map from .py to .txt to see if tests pass

* added symbols import

* added utf8 encoding flag

* added missing SCONJ symbol

* Auto-format

* Remove unused imports

* Make tag map available in Indonesian defaults
---
 spacy/lang/id/__init__.py |  2 ++
 spacy/lang/id/tag_map.py  | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 spacy/lang/id/tag_map.py

diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index d3c47d4b4..08e2d8ec2 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -8,6 +8,7 @@ from .norm_exceptions import NORM_EXCEPTIONS
 from .lemmatizer import LOOKUP
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
+from .tag_map import TAG_MAP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 
@@ -30,6 +31,7 @@ class IndonesianDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS
     lemma_lookup = LOOKUP
+    tag_map = TAG_MAP
 
 
 class Indonesian(Language):
diff --git a/spacy/lang/id/tag_map.py b/spacy/lang/id/tag_map.py
new file mode 100644
index 000000000..71d105bf4
--- /dev/null
+++ b/spacy/lang/id/tag_map.py
@@ -0,0 +1,34 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PRON, AUX, SCONJ
+
+
+# POS explanations for indonesian available from https://www.aclweb.org/anthology/Y12-1014
+TAG_MAP = {
+    "NSD": {POS: NOUN},
+    "Z--": {POS: PUNCT},
+    "VSA": {POS: VERB},
+    "CC-": {POS: NUM},
+    "R--": {POS: ADP},
+    "D--": {POS: ADV},
+    "ASP": {POS: ADJ},
+    "S--": {POS: SCONJ},
+    "VSP": {POS: VERB},
+    "H--": {POS: CCONJ},
+    "F--": {POS: X},
+    "B--": {POS: DET},
+    "CO-": {POS: NUM},
+    "G--": {POS: ADV},
+    "PS3": {POS: PRON},
+    "W--": {POS: ADV},
+    "O--": {POS: AUX},
+    "PP1": {POS: PRON},
+    "ASS": {POS: ADJ},
+    "PS1": {POS: PRON},
+    "APP": {POS: ADJ},
+    "CD-": {POS: NUM},
+    "VPA": {POS: VERB},
+    "VPP": {POS: VERB},
+}