From 7b064542f7d0b7b2e0c7441ce37037a0e9b4fd1e Mon Sep 17 00:00:00 2001 From: Kirill Bulygin Date: Thu, 10 Jan 2019 19:40:37 +0500 Subject: [PATCH] Making `lang/th/test_tokenizer.py` pass by creating `ThaiTokenizer` (#3078) --- spacy/lang/ja/__init__.py | 64 +++++++++++++++++++-------------------- spacy/lang/th/__init__.py | 41 +++++++++++++++++-------- spacy/util.py | 16 ++++++++++ 3 files changed, 75 insertions(+), 46 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 05b1198d8..9fa7e6678 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -1,20 +1,22 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from ...language import Language -from ...attrs import LANG -from ...tokens import Doc, Token -from ...tokenizer import Tokenizer -from .tag_map import TAG_MAP - import re from collections import namedtuple +from .tag_map import TAG_MAP + +from ...attrs import LANG +from ...language import Language +from ...tokens import Doc, Token +from ...util import DummyTokenizer + ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'lemma', 'pos']) # XXX Is this the right place for this? Token.set_extension('mecab_tag', default=None) + def try_mecab_import(): """Mecab is required for Japanese support, so check for it. @@ -26,6 +28,7 @@ def try_mecab_import(): raise ImportError("Japanese support requires MeCab: " "https://github.com/SamuraiT/mecab-python3") + def resolve_pos(token): """If necessary, add a field to the POS tag for UD mapping. @@ -40,81 +43,76 @@ def resolve_pos(token): # PoS mappings. if token.pos == '連体詞,*,*,*': - if re.match('^[こそあど此其彼]の', token.surface): + if re.match(r'[こそあど此其彼]の', token.surface): return token.pos + ',DET' - if re.match('^[こそあど此其彼]', token.surface): + if re.match(r'[こそあど此其彼]', token.surface): return token.pos + ',PRON' - else: - return token.pos + ',ADJ' + return token.pos + ',ADJ' + return token.pos + def detailed_tokens(tokenizer, text): """Format Mecab output into a nice data structure, based on Janome.""" node = tokenizer.parseToNode(text) - node = node.next # first node is beginning of sentence and empty, skip it + node = node.next # first node is beginning of sentence and empty, skip it words = [] while node.posid != 0: surface = node.surface - base = surface # a default value. Updated if available later. + base = surface # a default value. Updated if available later. 
parts = node.feature.split(',') pos = ','.join(parts[0:4]) if len(parts) > 7: - # this information is only available for words in the tokenizer dictionary + # this information is only available for words in the tokenizer + # dictionary base = parts[7] - words.append( ShortUnitWord(surface, base, pos) ) + words.append(ShortUnitWord(surface, base, pos)) node = node.next return words -class JapaneseTokenizer(object): + +class JapaneseTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - MeCab = try_mecab_import() - self.tokenizer = MeCab.Tagger() + self.tokenizer = try_mecab_import().Tagger() self.tokenizer.parseToNode('') # see #2901 def __call__(self, text): dtokens = detailed_tokens(self.tokenizer, text) + words = [x.surface for x in dtokens] - doc = Doc(self.vocab, words=words, spaces=[False]*len(words)) + spaces = [False] * len(words) + doc = Doc(self.vocab, words=words, spaces=spaces) + for token, dtoken in zip(doc, dtokens): token._.mecab_tag = dtoken.pos token.tag_ = resolve_pos(dtoken) token.lemma_ = dtoken.lemma + return doc - # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to - # allow serialization (see #1557) - def to_bytes(self, **exclude): - return b'' - - def from_bytes(self, bytes_data, **exclude): - return self - - def to_disk(self, path, **exclude): - return None - - def from_disk(self, path, **exclude): - return self class JapaneseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: 'ja' + lex_attr_getters[LANG] = lambda _text: 'ja' + tag_map = TAG_MAP @classmethod def create_tokenizer(cls, nlp=None): return JapaneseTokenizer(cls, nlp) + class Japanese(Language): lang = 'ja' Defaults = JapaneseDefaults - Tokenizer = JapaneseTokenizer def make_doc(self, text): return self.tokenizer(text) + __all__ = ['Japanese'] diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 0786bbdc4..1f6c9a954 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -5,34 +5,49 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS -from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...tokens import Doc -from ..norm_exceptions import BASE_NORMS +from ...attrs import LANG from ...language import Language -from ...attrs import LANG, NORM -from ...util import update_exc, add_lookups +from ...tokens import Doc +from ...util import DummyTokenizer + + +class ThaiTokenizer(DummyTokenizer): + def __init__(self, cls, nlp=None): + try: + from pythainlp.tokenize import word_tokenize + except ImportError: + raise ImportError( + "The Thai tokenizer requires the PyThaiNLP library: " + "https://github.com/PyThaiNLP/pythainlp") + + self.word_tokenize = word_tokenize + self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) + + def __call__(self, text): + words = list(self.word_tokenize(text, "newmm")) + spaces = [False] * len(words) + return Doc(self.vocab, words=words, spaces=spaces) class ThaiDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: 'th' + lex_attr_getters[LANG] = lambda _text: 'th' + tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP stop_words = STOP_WORDS + @classmethod + def create_tokenizer(cls, nlp=None): + return ThaiTokenizer(cls, nlp) + class Thai(Language): lang = 'th' Defaults = ThaiDefaults def 
make_doc(self, text): - try: - from pythainlp.tokenize import word_tokenize - except ImportError: - raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " - "https://github.com/PyThaiNLP/pythainlp") - words = [x for x in list(word_tokenize(text,"newmm"))] - return Doc(self.vocab, words=words, spaces=[False]*len(words)) + return self.tokenizer(text) __all__ = ['Thai'] diff --git a/spacy/util.py b/spacy/util.py index c66da759c..290d5eba5 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -635,3 +635,19 @@ class SimpleFrozenDict(dict): def update(self, other): raise NotImplementedError(Errors.E095) + + +class DummyTokenizer(object): + # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to + # allow serialization (see #1557) + def to_bytes(self, **exclude): + return b'' + + def from_bytes(self, _bytes_data, **exclude): + return self + + def to_disk(self, _path, **exclude): + return None + + def from_disk(self, _path, **exclude): + return self
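
Usage sketch (not part of the patch): a minimal example of how the new `ThaiTokenizer` and the shared `DummyTokenizer` serialization stubs would be exercised. It assumes `pythainlp` is installed; the sample Thai string and variable names below are illustrative only.

    import spacy

    # Thai() now builds a ThaiTokenizer via ThaiDefaults.create_tokenizer(),
    # which imports pythainlp lazily and raises a descriptive ImportError if
    # the package is missing.
    nlp = spacy.blank('th')
    doc = nlp('สวัสดีครับ')  # segmented with pythainlp's "newmm" engine
    print([t.text for t in doc])

    # DummyTokenizer's no-op to_bytes/from_bytes/to_disk/from_disk (see #1557)
    # keep pipeline serialization from failing on the custom tokenizer:
    data = nlp.to_bytes()
    nlp2 = spacy.blank('th').from_bytes(data)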
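
The refactored Japanese path works the same way (a sketch, assuming mecab-python3 and a MeCab dictionary are available; the sample sentence is illustrative):

    import spacy

    # JapaneseTokenizer now inherits the no-op serialization hooks from
    # DummyTokenizer; the `mecab_tag` Token extension is registered when
    # spacy.lang.ja is imported.
    nlp = spacy.blank('ja')
    for token in nlp('日本語の文です'):
        print(token.text, token.tag_, token._.mecab_tag, token.lemma_)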