# encoding: utf8
from __future__ import unicode_literals, print_function

import re
from collections import namedtuple

from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
from ...attrs import LANG
from ...compat import copy_reg
from ...language import Language
from ...symbols import POS
from ...tokens import Doc
from ...util import DummyTokenizer, get_words_and_spaces

# Hold the attributes we need with convenient names
DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])

# Handling for multiple spaces in a row is somewhat awkward, this simplifies
# the flow by creating a dummy with the same interface.
DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
DummySpace = DummyNode(" ", " ", " ")


def try_sudachi_import():
    """SudachiPy is required for Japanese support, so check for it.
    If it's not available, blow up and explain how to fix it."""
    try:
        from sudachipy import dictionary, tokenizer

        tok = dictionary.Dictionary().create(mode=tokenizer.Tokenizer.SplitMode.A)
        return tok
    except ImportError:
        raise ImportError(
            "Japanese support requires SudachiPy: "
            "https://github.com/WorksApplications/SudachiPy"
        )


def resolve_pos(token, next_token):
    """If necessary, add a field to the POS tag for UD mapping.
    Under Universal Dependencies, sometimes the same Unidic POS tag can
    be mapped differently depending on the literal token or its context
    in the sentence. This function returns resolved POSs for both token
    and next_token as a tuple.
    """
    # Some tokens have their UD tag decided based on the POS of the following
    # token.

    # orth based rules
    if token.pos[0] in TAG_ORTH_MAP:
        orth_map = TAG_ORTH_MAP[token.pos[0]]
        if token.surface in orth_map:
            return orth_map[token.surface], None

    # tag bi-gram mapping
    if next_token:
        tag_bigram = token.pos[0], next_token.pos[0]
        if tag_bigram in TAG_BIGRAM_MAP:
            bipos = TAG_BIGRAM_MAP[tag_bigram]
            if bipos[0] is None:
                return TAG_MAP[token.pos[0]][POS], bipos[1]
            else:
                return bipos

    return TAG_MAP[token.pos[0]][POS], None


# Use a mapping of paired punctuation to avoid splitting quoted sentences.
pairpunct = {"「": "」", "『": "』", "【": "】"}


def separate_sentences(doc):
    """Given a doc, mark tokens that start sentences based on Unidic tags."""
    stack = []  # save paired punctuation
    for i, token in enumerate(doc[:-2]):
        # Set all tokens after the first to false by default. This is necessary
        # for the doc code to be aware we've done sentencization, see
        # `is_sentenced`.
        token.sent_start = i == 0
        if token.tag_:
            if token.tag_ == "補助記号-括弧開":
                ts = str(token)
                if ts in pairpunct:
                    stack.append(pairpunct[ts])
                elif stack and ts == stack[-1]:
                    stack.pop()

            if token.tag_ == "補助記号-句点":
                next_token = doc[i + 1]
                if next_token.tag_ != token.tag_ and not stack:
                    next_token.sent_start = True


def get_dtokens(tokenizer, text):
    """Run Sudachi on the text and return DetailedTokens with Unidic tag info."""
    tokens = tokenizer.tokenize(text)
    words = []
    for ti, token in enumerate(tokens):
        tag = "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"])
        inf = "-".join([xx for xx in token.part_of_speech()[4:] if xx != "*"])
        dtoken = DetailedToken(token.surface(), (tag, inf), token.dictionary_form())
        if ti > 0 and words[-1].pos[0] == "空白" and tag == "空白":
            # don't add multiple space tokens in a row
            continue
        words.append(dtoken)
    # Remove empty tokens. These can be produced with characters like … that
    # Sudachi normalizes internally.
    words = [ww for ww in words if len(ww.surface) > 0]
    return words


class JapaneseTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.tokenizer = try_sudachi_import()

    def __call__(self, text):
        dtokens = get_dtokens(self.tokenizer, text)
        words = [x.surface for x in dtokens]
        words, spaces = get_words_and_spaces(words, text)
        unidic_tags = [",".join(x.pos) for x in dtokens]
        doc = Doc(self.vocab, words=words, spaces=spaces)
        next_pos = None
        for ii, (token, dtoken) in enumerate(zip(doc, dtokens)):
            ntoken = dtokens[ii + 1] if ii + 1 < len(dtokens) else None
            token.tag_ = dtoken.pos[0]
            if next_pos:
                token.pos = next_pos
                next_pos = None
            else:
                token.pos, next_pos = resolve_pos(dtoken, ntoken)
            # if there's no lemma info (it's an unk) just use the surface
            token.lemma_ = dtoken.lemma
        doc.user_data["unidic_tags"] = unidic_tags

        separate_sentences(doc)
        return doc


class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return JapaneseTokenizer(cls, nlp)


class Japanese(Language):
    lang = "ja"
    Defaults = JapaneseDefaults

    def make_doc(self, text):
        return self.tokenizer(text)


def pickle_japanese(instance):
    return Japanese, tuple()


copy_reg.pickle(Japanese, pickle_japanese)

__all__ = ["Japanese"]
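

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how this tokenizer is typically exercised, assuming
# SudachiPy and a Sudachi dictionary are installed. The sample sentence is
# only for demonstration. Guarded so that importing this module is unaffected.
if __name__ == "__main__":
    nlp = Japanese()
    doc = nlp("日本語の文章を解析します。")
    for token in doc:
        print(token.text, token.tag_, token.pos_, token.lemma_)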