"""ICU-based collation.

This collation backend uses the International Components for Unicode
library to provide accurate and high-performance collation. It supports
multiple locales and advanced sorting capabilities.

Use this collation backend if possible; it's by far the best.

Avoid this backend if...

 - ICU is not available for your system.

"""

import collate._abcollator
import collate._locale
import collate.errors

from collate.icu import _icu


def _as_unicode(value, collator):
    """Decode value with the collator's encoding if it is a str.

    Anything that is not a str instance is handed back untouched.
    Undecodable input is handled with the 'replace' error policy.
    """
    if not isinstance(value, str):
        return value
    return value.decode(collator.encoding, 'replace')


class Collator(collate._abcollator.Collator):
    """Collator backed by the _icu extension module."""

    def __init__(self, locale, encoding=None):
        """Build the ICU collator and word breaker for a locale.

        The (locale, encoding) pair is first passed through
        collate._locale.getpair. The "C" locale is mapped onto ICU's
        "root" locale.

        Raises collate.errors.InvalidLocaleError when ICU reports that
        it used default information for a locale other than "C".
        """
        locale, encoding = collate._locale.getpair(locale, encoding)

        if locale == "C":
            icu_name = "root"
        else:
            icu_name = locale

        self._collator = _icu.Collator(icu_name)
        self.locale = self._collator.locale
        self.encoding = collate._locale.encoding(encoding)

        fell_back = self._collator.used_default_information
        if fell_back and locale != "C":
            raise collate.errors.InvalidLocaleError(locale)

        try:
            breaker = _icu.WordBreaker(icu_name)
        except ValueError:
            # Thai is the only language with a special break locale,
            # so failing here is harmless; use the root rules instead.
            breaker = _icu.WordBreaker("root")
        self._breaker = breaker

    def words(self, string):
        """Split a string with the word breaker, dropping whitespace items.

        A str instance is decoded according to the 'encoding' attribute
        of the Collator before it is split.
        """
        def is_not_space(piece):
            return not piece.isspace()

        text = _as_unicode(string, self)
        pieces = self._breaker.words(text)
        return filter(is_not_space, pieces)

    def key(self, string):
        """Sort key for a string.

        A str instance is decoded to a unicode instance according to
        the 'encoding' attribute of the Collator before the key is
        computed.
        """
        text = _as_unicode(string, self)
        return self._collator.key(text)

    def cmp(self, a, b):
        """Return negative if a < b, zero if a == b, positive if a > b.

        Arguments given as strs rather than unicodes are decoded
        according to the 'encoding' attribute of the Collator before
        they are compared.
        """
        left = _as_unicode(a, self)
        right = _as_unicode(b, self)
        return self._collator.cmp(left, right)