From 809176ffb67cc6b3c1137482c290b90f67945a6f Mon Sep 17 00:00:00 2001
From: Yomguithereal
Date: Thu, 27 May 2021 17:29:49 +0200
Subject: [PATCH] Improving reduce lengthening regarding perf

Fixes #30
---
 fog/tokenizers/words.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fog/tokenizers/words.py b/fog/tokenizers/words.py
index a5e948f..83d6712 100644
--- a/fog/tokenizers/words.py
+++ b/fog/tokenizers/words.py
@@ -40,7 +40,7 @@ EMAIL_LOOKAHEAD_RE = re.compile(r'^[A-Za-z0-9!#$%&*+\-/=?^_`{|}~]{1,64}@')
 SMILEY_RE = re.compile(r'^(?:[\-]+>|<[\-]+|[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8]|[<:]3|\^\^)')
 EMOJI_RE = get_emoji_regexp()
 POINT_SPLITTER_RE = re.compile(r'(\.)')
-LENGTHENING_RE = re.compile(r'(.)\1{4,}')
+LENGTHENING_RE = re.compile(r'([^\W\d])\1{4,}')
 
 ENGLISH_CONTRACTIONS = ['ll', 're', 'm', 's', 've', 'd']
 ENGLISH_ARCHAIC_CONTRACTIONS = ['tis', 'twas']
@@ -294,7 +294,6 @@ class WordTokenizer(object):
 
         if (
             self.lower or
-            self.reduce_words or
             self.normalize_mentions or
             self.normalize_hashtags or
             self.mentions_as_words or
@@ -607,6 +606,9 @@ class WordTokenizer(object):
         if self.unidecode:
             string = unidecode_expect_ascii(string)
 
+        if self.reduce_words:
+            string = reduce_lenghtening(string)
+
         if self.__only_defaults:
             yield from self.__tokenize(string)
             return
@@ -647,9 +649,6 @@ class WordTokenizer(object):
             if self.lower:
                 token_value = token_value.lower()
 
-            if self.reduce_words:
-                token_value = reduce_lenghtening(token_value)
-
             if self.min_word_length is not None and len(token_value) < self.min_word_length:
                 continue
 