From c8f83aeb873c2d3beff22cbe0f967b6d56b6793e Mon Sep 17 00:00:00 2001
From: Yasuaki Uechi
Date: Wed, 3 May 2017 13:56:21 +0900
Subject: [PATCH] Add basic japanese support

---
 setup.py                  |  3 ++-
 spacy/__init__.py         |  4 ++--
 spacy/ja/__init__.py      | 19 +++++++++++++++++++
 spacy/ja/language_data.py | 23 +++++++++++++++++++++++
 spacy/ja/stop_words.py    |  9 +++++++++
 spacy/ja/tag_map.py       | 24 ++++++++++++++++++++++++
 6 files changed, 79 insertions(+), 3 deletions(-)
 create mode 100644 spacy/ja/__init__.py
 create mode 100644 spacy/ja/language_data.py
 create mode 100644 spacy/ja/stop_words.py
 create mode 100644 spacy/ja/tag_map.py

diff --git a/setup.py b/setup.py
index 1f13747dc..52ce06843 100755
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,8 @@ PACKAGES = [
     'spacy.fi',
     'spacy.bn',
     'spacy.he',
-    'spacy.nb',
+    'spacy.nb',
+    'spacy.ja',
     'spacy.en.lemmatizer',
     'spacy.cli.converters',
     'spacy.language_data',
diff --git a/spacy/__init__.py b/spacy/__init__.py
index f71d3addd..f5912e13e 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -5,12 +5,12 @@ from . import util
 from .deprecated import resolve_model_name
 from .cli.info import info
 
-from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja
 
 
 _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
               it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
-              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian)
+              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese)
 
 
 for _lang in _languages:
diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
new file mode 100644
index 000000000..f9ab7b560
--- /dev/null
+++ b/spacy/ja/__init__.py
@@ -0,0 +1,19 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+from ..attrs import LANG
+from ..tokens import Doc
+
+from .language_data import *
+
+
+class Japanese(Language):
+    lang = 'ja'
+
+    def make_doc(self, text):
+        from janome.tokenizer import Tokenizer
+        words = [x.surface for x in Tokenizer().tokenize(text)]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py
new file mode 100644
index 000000000..2e8dfbafb
--- /dev/null
+++ b/spacy/ja/language_data.py
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# import base language data
+from .. import language_data as base
+
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc
+
+
+# import language-specific data from files
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+
+
+TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+
+# export
+__all__ = ["TAG_MAP", "STOP_WORDS"]
\ No newline at end of file
diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py
new file mode 100644
index 000000000..b2120b30d
--- /dev/null
+++ b/spacy/ja/stop_words.py
@@ -0,0 +1,9 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# stop words as whitespace-separated list
+STOP_WORDS = set("""
+。
+、
+""".split())
\ No newline at end of file
diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py
new file mode 100644
index 000000000..2196ff397
--- /dev/null
+++ b/spacy/ja/tag_map.py
@@ -0,0 +1,24 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB}
+}
\ No newline at end of file
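
For reviewers, a minimal usage sketch of the new language class. This assumes janome is installed (e.g. `pip install janome`) and that `Japanese()` can be constructed without bundled model data, like the other language subclasses here; the sample sentence is only illustrative:

    # encoding: utf8
    from __future__ import unicode_literals, print_function

    from spacy.ja import Japanese

    nlp = Japanese()

    # make_doc() runs the raw text through janome's Tokenizer and wraps
    # the surface forms in a Doc, reinserting no whitespace between tokens.
    doc = nlp.make_doc('日本語の文章を形態素解析する。')
    print([w.text for w in doc])

Two design points worth noting: importing janome inside make_doc() keeps it an optional dependency, so the rest of spaCy stays importable when janome is absent; and passing spaces=[False]*len(words) means doc.text reproduces the input without inserting spaces, since Japanese does not delimit words with whitespace.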