mirror of https://github.com/Yomguithereal/fog.git
parent
de662d5cb8
commit
809176ffb6
|
@ -40,7 +40,7 @@ EMAIL_LOOKAHEAD_RE = re.compile(r'^[A-Za-z0-9!#$%&*+\-/=?^_`{|}~]{1,64}@')
|
||||||
SMILEY_RE = re.compile(r'^(?:[\-]+>|<[\-]+|[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8]|[<:]3|\^\^)')
|
SMILEY_RE = re.compile(r'^(?:[\-]+>|<[\-]+|[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8]|[<:]3|\^\^)')
|
||||||
EMOJI_RE = get_emoji_regexp()
|
EMOJI_RE = get_emoji_regexp()
|
||||||
POINT_SPLITTER_RE = re.compile(r'(\.)')
|
POINT_SPLITTER_RE = re.compile(r'(\.)')
|
||||||
LENGTHENING_RE = re.compile(r'(.)\1{4,}')
|
LENGTHENING_RE = re.compile(r'([^\W\d])\1{4,}')
|
||||||
|
|
||||||
ENGLISH_CONTRACTIONS = ['ll', 're', 'm', 's', 've', 'd']
|
ENGLISH_CONTRACTIONS = ['ll', 're', 'm', 's', 've', 'd']
|
||||||
ENGLISH_ARCHAIC_CONTRACTIONS = ['tis', 'twas']
|
ENGLISH_ARCHAIC_CONTRACTIONS = ['tis', 'twas']
|
||||||
|
@ -294,7 +294,6 @@ class WordTokenizer(object):
|
||||||
|
|
||||||
if (
|
if (
|
||||||
self.lower or
|
self.lower or
|
||||||
self.reduce_words or
|
|
||||||
self.normalize_mentions or
|
self.normalize_mentions or
|
||||||
self.normalize_hashtags or
|
self.normalize_hashtags or
|
||||||
self.mentions_as_words or
|
self.mentions_as_words or
|
||||||
|
@ -607,6 +606,9 @@ class WordTokenizer(object):
|
||||||
if self.unidecode:
|
if self.unidecode:
|
||||||
string = unidecode_expect_ascii(string)
|
string = unidecode_expect_ascii(string)
|
||||||
|
|
||||||
|
if self.reduce_words:
|
||||||
|
string = reduce_lenghtening(string)
|
||||||
|
|
||||||
if self.__only_defaults:
|
if self.__only_defaults:
|
||||||
yield from self.__tokenize(string)
|
yield from self.__tokenize(string)
|
||||||
return
|
return
|
||||||
|
@ -647,9 +649,6 @@ class WordTokenizer(object):
|
||||||
if self.lower:
|
if self.lower:
|
||||||
token_value = token_value.lower()
|
token_value = token_value.lower()
|
||||||
|
|
||||||
if self.reduce_words:
|
|
||||||
token_value = reduce_lenghtening(token_value)
|
|
||||||
|
|
||||||
if self.min_word_length is not None and len(token_value) < self.min_word_length:
|
if self.min_word_length is not None and len(token_value) < self.min_word_length:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue