spaCy/spacy/lang/ja/__init__.py

# encoding: utf8
from __future__ import unicode_literals, print_function

from ...language import Language
from ...attrs import LANG
from ...tokens import Doc, Token
from ...tokenizer import Tokenizer
from ... import util
from .tag_map import TAG_MAP

import re
from collections import namedtuple

# Record for a single token produced by detailed_tokens(): the surface
# string, the lemma (base form) and the POS tag string.
ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'lemma', 'pos'])

def try_mecab_import():
    """Import and return the MeCab module, which Japanese support requires.

    As a side effect, the ``mecab_tag`` extension (default ``None``) is
    registered on ``Token`` if it has not been registered yet.

    If MeCab is not available, blow up with an ImportError that explains
    how to fix it.
    """
    # Keep the try body down to the import itself so that only a missing
    # MeCab is reported as a missing MeCab.
    try:
        import MeCab
    except ImportError:
        raise ImportError("Japanese support requires MeCab: "
                          "https://github.com/SamuraiT/mecab-python3")
    # XXX Is this the right place for this?
    # This function runs every time a JapaneseTokenizer is constructed, so
    # only register the extension when it is missing instead of registering
    # it again on each call.
    if not Token.has_extension('mecab_tag'):
        Token.set_extension('mecab_tag', default=None)
    return MeCab

def resolve_pos(token):
    """Return the token's POS tag, extended for UD mapping where needed.

    The same Unidic POS tag can map to different Universal Dependencies
    tags depending on the literal token or its sentence context. For such
    tags an extra field is appended so the mapping becomes unambiguous;
    every other tag is returned unchanged.

    `token` must expose `pos` and `surface` attributes.
    """

    # NOTE: First take only. The rules below are crude approximations; for
    # many of them full dependencies are needed to resolve the PoS mapping
    # properly.

    tag = token.pos
    if tag != '連体詞,*,*,*':
        # Only this tag is disambiguated here.
        return tag

    surface = token.surface
    if re.match('^[こそあど此其彼]の', surface):
        # Demonstrative stem directly followed by の.
        suffix = ',DET'
    elif re.match('^[こそあど此其彼]', surface):
        # Demonstrative stem without a following の.
        suffix = ',PRON'
    else:
        suffix = ',ADJ'
    return tag + suffix

def detailed_tokens(tokenizer, text):
    """Run MeCab over `text` and return its tokens as ShortUnitWord records.

    `tokenizer` must provide `parse` and `parseToNode`. Each record holds
    the node's surface, its lemma (the surface itself when the feature
    string carries no lemma field) and the first four feature fields
    joined with commas as the POS tag. The layout is based on Janome.
    """
    # The result of this call is discarded.
    # NOTE(review): presumably a workaround needed before parseToNode() --
    # confirm before removing.
    tokenizer.parse(text)
    # The first node is the beginning of the sentence and empty, so start
    # from the one after it.
    current = tokenizer.parseToNode(text).next
    result = []
    # A node with posid 0 ends the walk.
    while current.posid != 0:
        surface = current.surface
        fields = current.feature.split(',')
        # The lemma field is only available for words in the tokenizer
        # dictionary; otherwise the surface form stands in for it.
        lemma = fields[7] if len(fields) > 7 else surface
        result.append(ShortUnitWord(surface, lemma, ','.join(fields[:4])))
        current = current.next
    return result

class JapaneseTokenizer(object):
    """Tokenizer that segments text with MeCab and returns a Doc."""

    def __init__(self, cls, nlp=None):
        """Set up the vocab and the MeCab tagger.

        `cls` is the defaults class used to create a vocab when no `nlp`
        object is given; otherwise the vocab of `nlp` is shared.
        """
        if nlp is None:
            self.vocab = cls.create_vocab(nlp)
        else:
            self.vocab = nlp.vocab

        # Raises ImportError with install instructions if MeCab is missing.
        mecab_module = try_mecab_import()
        self.tokenizer = mecab_module.Tagger()

    def __call__(self, text):
        """Tokenize `text` and return a Doc with tag and lemma filled in."""
        units = detailed_tokens(self.tokenizer, text)
        surfaces = [unit.surface for unit in units]
        # No token is marked as being followed by whitespace.
        no_spaces = [False] * len(surfaces)
        doc = Doc(self.vocab, words=surfaces, spaces=no_spaces)
        for token, unit in zip(doc, units):
            # Keep the raw MeCab tag next to the resolved one.
            token._.mecab_tag = unit.pos
            token.tag_ = resolve_pos(unit)
            token.lemma_ = unit.lemma
        return doc

    # The four methods below are dummies for to_bytes, from_bytes, to_disk
    # and from_disk; they exist only to allow serialization (see #1557).
    def to_bytes(self, **exclude):
        """Return an empty byte string."""
        return b''

    def from_bytes(self, bytes_data, **exclude):
        """Ignore `bytes_data` and return this tokenizer."""
        return self

    def to_disk(self, path, **exclude):
        """Do nothing and return None."""
        return None

    def from_disk(self, path, **exclude):
        """Ignore `path` and return this tokenizer."""
        return self

class JapaneseCharacterSegmenter(object):
    """Tokenizer that emits one token per character.

    The text is first split by a regular Tokenizer built from the Japanese
    defaults; each resulting token is then broken into its characters.
    """

    def __init__(self, vocab):
        """Store `vocab` and build the pre-segmenting Tokenizer from it."""
        self.vocab = vocab
        self._presegmenter = self._make_presegmenter(self.vocab)

    def _make_presegmenter(self, vocab):
        """Build a Tokenizer configured from Japanese.Defaults."""
        defaults = Japanese.Defaults

        # Each matcher is only compiled when the defaults actually define
        # patterns for it; otherwise None is passed through.
        prefix_search = None
        if defaults.prefixes:
            prefix_search = util.compile_prefix_regex(defaults.prefixes).search

        suffix_search = None
        if defaults.suffixes:
            suffix_search = util.compile_suffix_regex(defaults.suffixes).search

        infix_finditer = None
        if defaults.infixes:
            infix_finditer = util.compile_infix_regex(defaults.infixes).finditer

        return Tokenizer(vocab,
                         rules=defaults.tokenizer_exceptions,
                         prefix_search=prefix_search,
                         suffix_search=suffix_search,
                         infix_finditer=infix_finditer,
                         token_match=defaults.token_match)

    def __call__(self, text):
        """Split `text` into single-character tokens and return a Doc."""
        chars = []
        trailing = []
        for token in self._presegmenter(text):
            token_text = token.text
            chars.extend(token_text)
            trailing.extend(False for _ in token_text)
            # Only the last character inherits the token's trailing
            # whitespace flag.
            trailing[-1] = bool(token.whitespace_)
        return Doc(self.vocab, words=chars, spaces=trailing)


class JapaneseDefaults(Language.Defaults):
    """Defaults for Japanese: LANG getter, tag map and tokenizer factory."""
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'ja'
    tag_map = TAG_MAP
    # Despite the name, True selects the MeCab-based JapaneseTokenizer and
    # False selects JapaneseCharacterSegmenter.
    use_janome = True

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Return the tokenizer selected by `use_janome`.

        `nlp` is optional; when it is None a vocab is created from these
        defaults instead of being taken from `nlp`.
        """
        if cls.use_janome:
            return JapaneseTokenizer(cls, nlp)
        # nlp defaults to None, so fall back to a fresh vocab the same way
        # JapaneseTokenizer.__init__ does rather than failing on nlp.vocab.
        vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        return JapaneseCharacterSegmenter(vocab)

class Japanese(Language):
    """Language subclass for Japanese (language code 'ja')."""
    lang = 'ja'
    Defaults = JapaneseDefaults
    Tokenizer = JapaneseTokenizer

    def make_doc(self, text):
        """Return the result of running this object's tokenizer on `text`."""
        tokenize = self.tokenizer
        return tokenize(text)

# Public names exported by this module.
__all__ = ['Japanese']
Add basic japanese support 2017-05-03 04:56:21 +00:00			`# encoding: utf8`
			`from __future__ import unicode_literals, print_function`

Fix relative imports 2017-05-08 20:29:04 +00:00			`from ...language import Language`
			`from ...attrs import LANG`
Port Japanese mecab tokenizer from v1 (#2036) * Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests 2018-05-03 16:38:26 +00:00			`from ...tokens import Doc, Token`
Port over changes from #1157 2017-10-14 11:11:39 +00:00			`from ...tokenizer import Tokenizer`
Fix character-based tokenization for Japanese 2018-08-26 23:51:38 +00:00			`from ... import util`
Port Japanese mecab tokenizer from v1 (#2036) * Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests 2018-05-03 16:38:26 +00:00			`from .tag_map import TAG_MAP`
Port over changes from #1157 2017-10-14 11:11:39 +00:00
Port Japanese mecab tokenizer from v1 (#2036) * Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests 2018-05-03 16:38:26 +00:00			`import re`
			`from collections import namedtuple`

			`ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'lemma', 'pos'])`

			`def try_mecab_import():`
			`"""Mecab is required for Japanese support, so check for it.`

			`It it's not available blow up and explain how to fix it."""`
			`try:`
			`import MeCab`
Fix character-based tokenization for Japanese 2018-08-26 23:51:38 +00:00			`# XXX Is this the right place for this?`
			`Token.set_extension('mecab_tag', default=None)`
Port Japanese mecab tokenizer from v1 (#2036) * Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests 2018-05-03 16:38:26 +00:00			`return MeCab`
			`except ImportError:`
			`raise ImportError("Japanese support requires MeCab: "`
			`"https://github.com/SamuraiT/mecab-python3")`

			`def resolve_pos(token):`
			`"""If necessary, add a field to the POS tag for UD mapping.`

			`Under Universal Dependencies, sometimes the same Unidic POS tag can`
			`be mapped differently depending on the literal token or its context`
			`in the sentence. This function adds information to the POS tag to`
			`resolve ambiguous mappings.`
			`"""`

			`# NOTE: This is a first take. The rules here are crude approximations.`
			`# For many of these, full dependencies are needed to properly resolve`
			`# PoS mappings.`

			`if token.pos == '連体詞,,,*':`
			`if re.match('^[こそあど此其彼]の', token.surface):`
			`return token.pos + ',DET'`
			`if re.match('^[こそあど此其彼]', token.surface):`
			`return token.pos + ',PRON'`
			`else:`
			`return token.pos + ',ADJ'`
			`return token.pos`

			`def detailed_tokens(tokenizer, text):`
			`"""Format Mecab output into a nice data structure, based on Janome."""`
Fix character-based tokenization for Japanese 2018-08-26 23:51:38 +00:00			`tokenizer.parse(text)`
Port Japanese mecab tokenizer from v1 (#2036) * Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests 2018-05-03 16:38:26 +00:00			`node = tokenizer.parseToNode(text)`
			`node = node.next # first node is beginning of sentence and empty, skip it`
			`words = []`
			`while node.posid != 0:`
			`surface = node.surface`
			`base = surface # a default value. Updated if available later.`
			`parts = node.feature.split(',')`
			`pos = ','.join(parts[0:4])`

fix wrong indexing (#2416) * fix wrong indexing * add agreement 2018-06-19 08:20:57 +00:00			`if len(parts) > 7:`
Port Japanese mecab tokenizer from v1 (#2036) * Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests 2018-05-03 16:38:26 +00:00			`# this information is only available for words in the tokenizer dictionary`
			`base = parts[7]`

			`words.append( ShortUnitWord(surface, base, pos) )`
			`node = node.next`
			`return words`
Port over changes from #1157 2017-10-14 11:11:39 +00:00
			`class JapaneseTokenizer(object):`
			`def __init__(self, cls, nlp=None):`
			`self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)`
Port Japanese mecab tokenizer from v1 (#2036) * Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests 2018-05-03 16:38:26 +00:00
			`MeCab = try_mecab_import()`
			`self.tokenizer = MeCab.Tagger()`
Port over changes from #1157 2017-10-14 11:11:39 +00:00
			`def __call__(self, text):`
Port Japanese mecab tokenizer from v1 (#2036) * Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests 2018-05-03 16:38:26 +00:00			`dtokens = detailed_tokens(self.tokenizer, text)`
			`words = [x.surface for x in dtokens]`
			`doc = Doc(self.vocab, words=words, spaces=[False]*len(words))`
			`for token, dtoken in zip(doc, dtokens):`
			`token._.mecab_tag = dtoken.pos`
			`token.tag_ = resolve_pos(dtoken)`
Add Japanese lemmas (#2543) This info was already available from Mecab, forgot to add it before. 2018-07-13 08:55:14 +00:00			`token.lemma_ = dtoken.lemma`
Port Japanese mecab tokenizer from v1 (#2036) * Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests 2018-05-03 16:38:26 +00:00			`return doc`
Port over changes from #1157 2017-10-14 11:11:39 +00:00
Add dummy serialization methods for Japanese and missing lang getter (resolves #1557) 2017-11-15 11:44:02 +00:00			`# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to`
			`# allow serialization (see #1557)`
			`def to_bytes(self, **exclude):`
			`return b''`

			`def from_bytes(self, bytes_data, **exclude):`
			`return self`

			`def to_disk(self, path, **exclude):`
			`return None`

			`def from_disk(self, path, **exclude):`
			`return self`

Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 17:23:02 +00:00			`class JapaneseCharacterSegmenter(object):`
			`def __init__(self, vocab):`
			`self.vocab = vocab`
Fix character-based tokenization for Japanese 2018-08-26 23:51:38 +00:00			`self._presegmenter = self._make_presegmenter(self.vocab)`

			`def _make_presegmenter(self, vocab):`
			`rules = Japanese.Defaults.tokenizer_exceptions`
			`token_match = Japanese.Defaults.token_match`
			`prefix_search = (util.compile_prefix_regex(Japanese.Defaults.prefixes).search`
			`if Japanese.Defaults.prefixes else None)`
			`suffix_search = (util.compile_suffix_regex(Japanese.Defaults.suffixes).search`
			`if Japanese.Defaults.suffixes else None)`
			`infix_finditer = (util.compile_infix_regex(Japanese.Defaults.infixes).finditer`
			`if Japanese.Defaults.infixes else None)`
			`return Tokenizer(vocab, rules=rules,`
			`prefix_search=prefix_search,`
			`suffix_search=suffix_search,`
			`infix_finditer=infix_finditer,`
			`token_match=token_match)`
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 17:23:02 +00:00
			`def __call__(self, text):`
			`words = []`
			`spaces = []`
Fix character-based tokenization for Japanese 2018-08-26 23:51:38 +00:00			`doc = self._presegmenter(text)`
			`for token in doc:`
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 17:23:02 +00:00			`words.extend(list(token.text))`
			`spaces.extend([False]*len(token.text))`
			`spaces[-1] = bool(token.whitespace_)`
			`return Doc(self.vocab, words=words, spaces=spaces)`

Port over changes from #1157 2017-10-14 11:11:39 +00:00
			`class JapaneseDefaults(Language.Defaults):`
Add dummy serialization methods for Japanese and missing lang getter (resolves #1557) 2017-11-15 11:44:02 +00:00			`lex_attr_getters = dict(Language.Defaults.lex_attr_getters)`
			`lex_attr_getters[LANG] = lambda text: 'ja'`
Port Japanese mecab tokenizer from v1 (#2036) * Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests 2018-05-03 16:38:26 +00:00			`tag_map = TAG_MAP`
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 17:23:02 +00:00			`use_janome = True`
Add dummy serialization methods for Japanese and missing lang getter (resolves #1557) 2017-11-15 11:44:02 +00:00
Port over changes from #1157 2017-10-14 11:11:39 +00:00			`@classmethod`
			`def create_tokenizer(cls, nlp=None):`
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 17:23:02 +00:00			`if cls.use_janome:`
			`return JapaneseTokenizer(cls, nlp)`
			`else:`
Fix character-based tokenization for Japanese 2018-08-26 23:51:38 +00:00			`return JapaneseCharacterSegmenter(nlp.vocab)`
Add basic japanese support 2017-05-03 04:56:21 +00:00
			`class Japanese(Language):`
			`lang = 'ja'`
Port over changes from #1157 2017-10-14 11:11:39 +00:00			`Defaults = JapaneseDefaults`
Port Japanese mecab tokenizer from v1 (#2036) * Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests 2018-05-03 16:38:26 +00:00			`Tokenizer = JapaneseTokenizer`
Add basic japanese support 2017-05-03 04:56:21 +00:00
			`def make_doc(self, text):`
Fix Japanese tokenizer JapaneseTokenizer now returns a Doc, not individual words 2017-10-24 11:02:19 +00:00			`return self.tokenizer(text)`
adding export japanese 2017-05-03 09:07:29 +00:00
Reorganise Japanese language data 2017-05-08 13:50:46 +00:00			`__all__ = ['Japanese']`