add asciify and update slugify with ascii flag

This commit is contained in:
Mahmoud Hashemi 2013-02-21 02:58:37 -08:00
parent 8f8104daf3
commit e4d74349ff
1 changed file with 138 additions and 3 deletions

View File

@ -1,23 +1,37 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import re import re
import string import string
import unicodedata
from compat import str, unicode, basestring, bytes
_punct_ws_str = string.punctuation + string.whitespace _punct_ws_str = string.punctuation + string.whitespace
_punct_re = re.compile('[' + _punct_ws_str + ']+') _punct_re = re.compile('[' + _punct_ws_str + ']+')
_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))') _camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')
def slugify(text, delim='_', lower=True, ascii=False):
    """
    Turn text full of scary characters (i.e., punctuation and
    whitespace) into a relatively safe string: the pieces returned by
    split_punct_ws() are joined with `delim`, which defaults to '_'.

    The slug is lowercased unless `lower` is false. The `ascii`
    convenience flag passes the slug through asciify() (before any
    lowercasing) if you require ascii-only slugs.

    >>> slugify('First post! Hi!!!!~1 ')
    'first_post_hi_1'
    >>> slugify("Kurt Gödel's pretty cool.", ascii=True)
    'kurt_goedel_s_pretty_cool'
    """
    slug = delim.join(split_punct_ws(text))
    if ascii:
        slug = asciify(slug)
    return slug.lower() if lower else slug
def split_punct_ws(text): def split_punct_ws(text):
@ -52,3 +66,124 @@ def under2camel(under_string):
'ComplexTokenizer' 'ComplexTokenizer'
""" """
return ''.join(w.capitalize() or '_' for w in under_string.split('_')) return ''.join(w.capitalize() or '_' for w in under_string.split('_'))
def asciify(text, ignore=False):
    """
    Converts a unicode or bytestring into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a _utility_, primarily meant
    for slugification. Whenever possible, make your application work
    _with_ unicode, not against it.

    `text` may be a unicode string or a UTF-8 encoded bytestring; a
    bytestring that is not valid UTF-8 raises UnicodeDecodeError.

    When `ignore` is false (the default), characters that still cannot
    be represented in ascii after deaccenting are encoded with the
    'replace' error handler; when it is true they are dropped
    ('ignore' error handler).

    >>> asciify('Beyoncé')
    'Beyonce'
    """
    if isinstance(text, bytes):
        # Decode explicitly instead of relying on the implicit ascii
        # decode that only Python 2 performs inside str.encode(); on
        # Python 3 bytes objects have no encode() method at all.
        text = text.decode('utf-8')
    try:
        # Fast path: the text is already pure ascii.
        return text.encode('ascii')
    except UnicodeEncodeError:
        mode = 'ignore' if ignore else 'replace'
        # Apply the deaccenting table first, then decompose whatever is
        # left (NFKD) before encoding with the chosen error handler.
        transd = unicodedata.normalize('NFKD', text.translate(DEACCENT_MAP))
        return transd.encode('ascii', mode)
class DeaccenterDict(dict):
    """
    A small caching dictionary for deaccenting, keyed by integer code
    point so it can be used as the table passed to ``translate()``.

    Explicit entries are returned as-is. A code point with no entry is
    resolved once by ``__missing__`` and the answer is stored in the
    dictionary, so later lookups for it are plain dict hits.
    """

    def __missing__(self, key):
        """
        Compute, cache and return the replacement for code point `key`.

        If the character has a two-part decomposition (base character
        plus one mark), the base character's code point is returned.
        Otherwise `key` itself is returned, leaving the character
        unchanged.
        """
        ch = self.get(key)
        if ch is not None:
            # Only reachable when __missing__ is called directly for a
            # key that is already present.
            return ch
        try:
            to_char = unichr
        except NameError:
            # Python 3: chr() covers the whole unicode range.
            to_char = chr
        try:
            de = unicodedata.decomposition(to_char(key))
            p1, _, _ = de.rpartition(' ')
            # Fall back to the base character for every mark, diaeresis
            # (0x308) included. Looking the key up in the dictionary
            # again at this point can only yield None (the key is
            # missing, which is why we are here), and a None entry makes
            # translate() delete the character, e.g. turning "Noël"
            # into "Nol".
            ch = int(p1, 16)
        except (IndexError, ValueError):
            # No decomposition, a tagged/compatibility decomposition, a
            # single-code-point decomposition, or a code point the
            # character constructor rejects: keep the character.
            ch = self.get(key, key)
        self[key] = ch
        return ch

    try:
        from collections import defaultdict
    except ImportError:
        # no defaultdict means that __missing__ isn't supported in
        # this version of python, so we define __getitem__
        def __getitem__(self, key):
            try:
                return super(DeaccenterDict, self).__getitem__(key)
            except KeyError:
                return self.__missing__(key)
    else:
        del defaultdict
# Explicit deaccenting overrides, keyed by integer code point. Values are
# the unicode replacement text, which may be more than one character
# (e.g. "ss" for sharp s, "ae"/"oe"/"ue" for a/o/u with diaeresis).
_BASE_DEACCENT_MAP = {
0xc6: u"AE", # Æ LATIN CAPITAL LETTER AE
0xd0: u"D", # Ð LATIN CAPITAL LETTER ETH
0xd8: u"OE", # Ø LATIN CAPITAL LETTER O WITH STROKE
0xde: u"Th", # Þ LATIN CAPITAL LETTER THORN
0xc4: u'Ae', # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
0xd6: u'Oe', # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
0xdc: u'Ue', # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
0xc0: u"A", # À LATIN CAPITAL LETTER A WITH GRAVE
0xc1: u"A", # Á LATIN CAPITAL LETTER A WITH ACUTE
0xc3: u"A", # Ã LATIN CAPITAL LETTER A WITH TILDE
0xc7: u"C", # Ç LATIN CAPITAL LETTER C WITH CEDILLA
0xc8: u"E", # È LATIN CAPITAL LETTER E WITH GRAVE
0xc9: u"E", # É LATIN CAPITAL LETTER E WITH ACUTE
0xca: u"E", # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
0xcc: u"I", # Ì LATIN CAPITAL LETTER I WITH GRAVE
0xcd: u"I", # Í LATIN CAPITAL LETTER I WITH ACUTE
0xd2: u"O", # Ò LATIN CAPITAL LETTER O WITH GRAVE
0xd3: u"O", # Ó LATIN CAPITAL LETTER O WITH ACUTE
0xd5: u"O", # Õ LATIN CAPITAL LETTER O WITH TILDE
0xd9: u"U", # Ù LATIN CAPITAL LETTER U WITH GRAVE
0xda: u"U", # Ú LATIN CAPITAL LETTER U WITH ACUTE
0xdf: u"ss", # ß LATIN SMALL LETTER SHARP S
0xe6: u"ae", # æ LATIN SMALL LETTER AE
0xf0: u"d", # ð LATIN SMALL LETTER ETH
0xf8: u"oe", # ø LATIN SMALL LETTER O WITH STROKE
0xfe: u"th", # þ LATIN SMALL LETTER THORN
0xe4: u'ae', # ä LATIN SMALL LETTER A WITH DIAERESIS
0xf6: u'oe', # ö LATIN SMALL LETTER O WITH DIAERESIS
0xfc: u'ue', # ü LATIN SMALL LETTER U WITH DIAERESIS
0xe0: u"a", # à LATIN SMALL LETTER A WITH GRAVE
0xe1: u"a", # á LATIN SMALL LETTER A WITH ACUTE
0xe3: u"a", # ã LATIN SMALL LETTER A WITH TILDE
0xe7: u"c", # ç LATIN SMALL LETTER C WITH CEDILLA
0xe8: u"e", # è LATIN SMALL LETTER E WITH GRAVE
0xe9: u"e", # é LATIN SMALL LETTER E WITH ACUTE
0xea: u"e", # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
0xec: u"i", # ì LATIN SMALL LETTER I WITH GRAVE
0xed: u"i", # í LATIN SMALL LETTER I WITH ACUTE
0xf2: u"o", # ò LATIN SMALL LETTER O WITH GRAVE
0xf3: u"o", # ó LATIN SMALL LETTER O WITH ACUTE
0xf5: u"o", # õ LATIN SMALL LETTER O WITH TILDE
0xf9: u"u", # ù LATIN SMALL LETTER U WITH GRAVE
0xfa: u"u", # ú LATIN SMALL LETTER U WITH ACUTE
0x2018: u"'", # ‘ LEFT SINGLE QUOTATION MARK
0x2019: u"'", # ’ RIGHT SINGLE QUOTATION MARK
0x201c: u'"', # “ LEFT DOUBLE QUOTATION MARK
0x201d: u'"', # ” RIGHT DOUBLE QUOTATION MARK
}
# The table asciify() hands to translate(): the explicit overrides above,
# wrapped in a DeaccenterDict so code points without an entry are resolved
# by DeaccenterDict.__missing__.
DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)
if __name__ == '__main__':
    # Manual smoke test: asciify a sample string and show the result along
    # with the deaccent table, which by then holds the lookups cached
    # during the call.
    # print is written in call form with a single argument so the module
    # compiles on Python 3 and prints the same output on Python 2.
    b = asciify(u'Beyoncé')
    # Slice rather than index: indexing bytes yields an int on Python 3,
    # which ord() rejects; a length-1 slice works on both versions.
    print(ord(b[-1:]))
    print(b)
    print(DEACCENT_MAP)