-import collate.icu._ucol
+"""ICU-based collation.
+
+This collation backend uses the International Components for Unicode
+library to provide accurate and high-performance collation. It
+supports multiple locales and advanced sorting capabilities.
+
+Use this collation backend if possible; it's by far the best.
+
+Avoid this backend if...
+ - ICU is not available for your system.
+
+"""
+
+__all__ = ["Collator"]
+
import collate._abcollator
+import collate._locale
+import collate.errors
-NAME = "ICU"
+from collate.icu import _icu
class Collator(collate._abcollator.Collator):
- def __init__(self, locale, encoding):
- self._collator = collate.icu._ucol.Collator(locale)
- self.locale = self._collator.locale
+ """ICU-based collation."""
+
+ def __init__(self, locale, encoding=None):
+ super(Collator, self).__init__(locale, encoding)
+ locale, encoding = collate._locale.getpair(locale, encoding)
+ icu_locale = "root" if locale == "C" else locale
+ self._collator = _icu.Collator(icu_locale)
+ self.locale = self._collator.locale
+ self.encoding = collate._locale.encoding(encoding)
+ if self._collator.used_default_information and locale != "C":
+ raise collate.errors.InvalidLocaleError(locale)
+
+ try:
+ self._breaker = _icu.WordBreaker(icu_locale)
+ except ValueError:
+ # Thai is the only language with a special break locale,
+ # so this error is harmless; fall back to the root breaker.
+ self._breaker = _icu.WordBreaker("root")
+
+ def words(self, string):
+ """Split the string along word boundaries."""
+ string = self.unicode(string)
+ return self._breaker.words(string)
def key(self, string):
- return self._collator.key(string)
+ """Sort key for a string.
+
+ If the string is a str instance, it is decoded to a unicode
+ instance according to the 'encoding' attribute of the
+ Collator.
+ """
+ string = self.unicode(string)
+ return self._collator.key(string)
- def cmp(self, string1, string2):
- return self._collator.cmp(string1, string2)