Optimizing unidecode in WordTokenizer

Related to #30
This commit is contained in:
Yomguithereal 2021-05-27 17:15:52 +02:00
parent fcc0bd4acf
commit de662d5cb8
1 changed file with 5 additions and 6 deletions

View File

@ -25,7 +25,7 @@
import re
from html import unescape
from emoji import get_emoji_regexp
from unidecode import unidecode
from unidecode import unidecode_expect_ascii
from ebbe import with_next
from typing import Optional, Iterable
@ -294,7 +294,6 @@ class WordTokenizer(object):
if (
self.lower or
self.unidecode or
self.reduce_words or
self.normalize_mentions or
self.normalize_hashtags or
@ -605,6 +604,9 @@ class WordTokenizer(object):
if self.decode_html_entities:
string = unescape(string)
if self.unidecode:
string = unidecode_expect_ascii(string)
if self.__only_defaults:
yield from self.__tokenize(string)
return
@ -633,7 +635,7 @@ class WordTokenizer(object):
elif token_type == 'hashtag':
if self.normalize_hashtags:
token_value = unidecode(token_value.lower())
token_value = unidecode_expect_ascii(token_value.lower())
token_changed = True
if self.hashtags_as_words:
@ -645,9 +647,6 @@ class WordTokenizer(object):
if self.lower:
token_value = token_value.lower()
if self.unidecode:
token_value = unidecode(token_value)
if self.reduce_words:
token_value = reduce_lenghtening(token_value)