"""C library locale-based collation.

This collation backend uses the system's C library to sort strings. It
is fast and almost always available, but may sort strings outside of
the user's native locale incorrectly or confusingly (for example, en_US
tends to ignore hiragana characters; ja_JP does not case-fold Latin
characters).

Since the C library only supports one locale active at a time per
process, instantiating a Collator from this module will affect the
locale of all previous collators and anything else using the system
locale information.

Use this collation backend if...
- You are on a system without ICU.

Avoid this backend if...
- ICU is available for the current locale.
- You are sorting strings from alphabets outside the primary locale.
- You need to support collating multiple locales at once.
- You need the same results across multiple platforms.
"""

import locale
import re

import collate.errors
import collate._abcollator
import collate._locale


class Collator(collate._abcollator.Collator):
    """C library locale-based collation.

    Constructing an instance calls ``locale.setlocale(LC_COLLATE, ...)``,
    which changes the collation locale for the whole process.
    """

    def __init__(self, locale_code, encoding=None):
        """Activate the requested locale for collation.

        locale_code and encoding are first normalized by
        collate._locale.getpair; the process-wide LC_COLLATE locale is
        then set to "<locale_code>.<encoding>".

        Raises collate.errors.InvalidLocaleError (carrying the locale
        name that was attempted) if the C library rejects the locale.
        """
        locale_code, encoding = collate._locale.getpair(locale_code, encoding)
        # Built outside the try block so the except clause can always
        # report the exact name that was attempted.
        locale_name = locale_code + "." + encoding
        try:
            locale.setlocale(locale.LC_COLLATE, locale_name)
        except locale.Error:
            raise collate.errors.InvalidLocaleError(locale_name)
        # Record what the C library actually selected, which may be
        # spelled differently from what was requested.
        self.locale = locale.getlocale(locale.LC_COLLATE)[0]
        # NOTE(review): presumably a canonical codec name usable with
        # encode()/decode(); confirm against collate._locale.encoding.
        self.encoding = collate._locale.encoding(encoding)

    def key(self, string):
        """Return a sort key for a string.

        The key is produced by locale.strxfrm. If that raises
        UnicodeEncodeError for the given string, the string is first
        encoded according to the 'encoding' attribute of the Collator
        (unencodable characters are replaced) and transformed again.
        """
        try:
            return locale.strxfrm(string)
        except UnicodeEncodeError:
            return locale.strxfrm(string.encode(self.encoding, "replace"))

    def cmp(self, a, b):
        """Return negative if a < b, zero if a == b, positive if a > b.

        If byte strings rather than text strings are passed in, they
        are first decoded according to the 'encoding' attribute of the
        Collator (undecodable bytes are replaced).
        """
        # Test against bytes, not str: on Python 2 the two names are the
        # same type, while on Python 3 only bytes has a decode() method
        # and text strings must be passed through untouched.
        if isinstance(a, bytes):
            a = a.decode(self.encoding, "replace")
        if isinstance(b, bytes):
            b = b.decode(self.encoding, "replace")
        return locale.strcoll(a, b)

    def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
        """Split the string into separate words.

        By default the split happens on runs of non-word characters as
        classified by the regular expression engine with the re.UNICODE
        flag; this does not consult the active C locale. Pass a
        different pattern as sep to change the word boundary rule.
        """
        return re.split(sep, string)