/ Published in: Python
Useful when creating canonical forms of strings for indexing.
Expand |
Embed | Plain Text
Copy this code and paste it in your HTML
import re
import unicodedata

try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str  # Python 3: the ``unicode`` builtin no longer exists

# Code points stripped by remove_diacritics(), by Unicode block:
#   U+0300-U+036F  Combining Diacritical Marks
#   U+1DC0-U+1DFF  Combining Diacritical Marks Supplement
#   U+20D0-U+20FF  Combining Diacritical Marks for Symbols
#   U+FE20-U+FE2F  Combining Half Marks
reCombining = re.compile(
    u'[\u0300-\u036f\u1dc0-\u1dff\u20d0-\u20ff\ufe20-\ufe2f]', re.U)


def remove_diacritics(s):
    """Return *s* as text with combining diacritical marks removed.

    The input is converted to text, decomposed with Unicode NFD
    normalization (so a precomposed character such as U+00E9 becomes a
    base letter followed by a combining mark), and every code point
    matched by ``reCombining`` is then deleted.  The result is left in
    decomposed form; it is not recomposed afterwards.

    Args:
        s: The value to clean.  Text is used as-is, byte strings are
            decoded with the interpreter's default codec, and any other
            object is converted with the text type's constructor.

    Returns:
        A text string without the combining marks listed above.

    Raises:
        UnicodeDecodeError: If *s* is a byte string that the default
            codec cannot decode.
    """
    if isinstance(s, bytes):
        # Decode explicitly: on Python 3 ``str(b'...')`` would yield the
        # repr ("b'...'") instead of the text.  On Python 2 this is the
        # same default-codec decode that ``unicode(s)`` performs.
        s = s.decode()
    return reCombining.sub(u'', unicodedata.normalize('NFD', _text_type(s)))