mirror of https://github.com/explosion/spaCy.git
Add basic japanese support
This commit is contained in:
parent
f26a3b5a50
commit
c8f83aeb87
3
setup.py
3
setup.py
|
@ -36,7 +36,8 @@ PACKAGES = [
|
||||||
'spacy.fi',
|
'spacy.fi',
|
||||||
'spacy.bn',
|
'spacy.bn',
|
||||||
'spacy.he',
|
'spacy.he',
|
||||||
'spacy.nb',
|
'spacy.nb',
|
||||||
|
'spacy.ja',
|
||||||
'spacy.en.lemmatizer',
|
'spacy.en.lemmatizer',
|
||||||
'spacy.cli.converters',
|
'spacy.cli.converters',
|
||||||
'spacy.language_data',
|
'spacy.language_data',
|
||||||
|
|
|
@ -5,12 +5,12 @@ from . import util
|
||||||
from .deprecated import resolve_model_name
|
from .deprecated import resolve_model_name
|
||||||
from .cli.info import info
|
from .cli.info import info
|
||||||
|
|
||||||
from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
|
from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja
|
||||||
|
|
||||||
|
|
||||||
_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
|
_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
|
||||||
it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
|
it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
|
||||||
fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian)
|
fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese)
|
||||||
|
|
||||||
|
|
||||||
for _lang in _languages:
|
for _lang in _languages:
|
||||||
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
from os import path
|
||||||
|
|
||||||
|
from ..language import Language
|
||||||
|
from ..attrs import LANG
|
||||||
|
from ..tokens import Doc
|
||||||
|
|
||||||
|
from .language_data import *
|
||||||
|
|
||||||
|
|
||||||
|
class Japanese(Language):
    """Minimal spaCy language class for Japanese ('ja').

    Tokenization is delegated to the third-party Janome tokenizer;
    no tagger/parser data is provided yet.
    """
    lang = 'ja'

    def make_doc(self, text):
        """Tokenize ``text`` with Janome and return a ``Doc``.

        Raises:
            ImportError: if the ``janome`` package is not installed.
        """
        # Imported lazily so spaCy has no hard dependency on janome.
        from janome.tokenizer import Tokenizer
        # Constructing a Janome Tokenizer loads its whole dictionary,
        # so build it once per Language instance, not once per call.
        tokenizer = getattr(self, '_janome_tokenizer', None)
        if tokenizer is None:
            tokenizer = self._janome_tokenizer = Tokenizer()
        words = [token.surface for token in tokenizer.tokenize(text)]
        # Japanese script has no inter-token whitespace.
        return Doc(self.vocab, words=words, spaces=[False] * len(words))
|
|
@ -0,0 +1,23 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
# import base language data
|
||||||
|
from .. import language_data as base
|
||||||
|
|
||||||
|
|
||||||
|
# import util functions
|
||||||
|
from ..language_data import update_exc, strings_to_exc
|
||||||
|
|
||||||
|
|
||||||
|
# import language-specific data from files
|
||||||
|
from .tag_map import TAG_MAP
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
# Rebind as fresh, mutable containers so later per-language customization
# cannot mutate the shared objects imported from the data files.
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)

# Names exported by ``from .language_data import *``.
__all__ = ["TAG_MAP", "STOP_WORDS"]
|
|
@ -0,0 +1,9 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
# stop words as whitespace-separated list
|
||||||
|
# Japanese stop words. Currently only the ideographic full stop and
# comma; extend this set as language coverage grows.
STOP_WORDS = {
    "。",
    "、",
}
|
|
@ -0,0 +1,24 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..symbols import *
|
||||||
|
|
||||||
|
|
||||||
|
# Map coarse-grained Universal Dependencies POS tag strings to spaCy
# attribute dicts; ``POS`` and the tag symbols come from ``..symbols``.
# Identity mapping for now — Janome output is not yet tag-mapped.
TAG_MAP = {
    "ADV": {POS: ADV},
    "NOUN": {POS: NOUN},
    "ADP": {POS: ADP},
    "PRON": {POS: PRON},
    "SCONJ": {POS: SCONJ},
    "PROPN": {POS: PROPN},
    "DET": {POS: DET},
    "SYM": {POS: SYM},
    "INTJ": {POS: INTJ},
    "PUNCT": {POS: PUNCT},
    "NUM": {POS: NUM},
    "AUX": {POS: AUX},
    "X": {POS: X},
    "CONJ": {POS: CONJ},
    "ADJ": {POS: ADJ},
    "VERB": {POS: VERB}
}
|
Loading…
Reference in New Issue