X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2Ficu%2F__init__.py;h=6d27a6fa95cf2b2690d1a2f50174e4819bc27dc1;hp=bb0dcd83bf2087c9162e2b0f5508f64ce4213880;hb=HEAD;hpb=f7fd328bfc2886f6aed2c09b84cc1e039c7c3240 diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py index bb0dcd8..6d27a6f 100644 --- a/collate/icu/__init__.py +++ b/collate/icu/__init__.py @@ -1,15 +1,56 @@ -import collate.icu._ucol +"""ICU-based collation. + +This collation backend uses the International Components for Unicode +library to provide accurate and high-performance collation. It +supports multiple locales and advanced sorting capabilities. + +Use this collation backend if possible; it's by far the best. + +Avoid this backend if... + - ICU is not available for your system. + +""" + +__all__ = ["Collator"] + import collate._abcollator +import collate._locale +import collate.errors -NAME = "ICU" +from collate.icu import _icu class Collator(collate._abcollator.Collator): - def __init__(self, locale, encoding): - self._collator = collate.icu._ucol.Collator(locale) - self.locale = self._collator.locale + """ICU-based collation.""" + + def __init__(self, locale, encoding=None): + super(Collator, self).__init__(locale, encoding) + locale, encoding = collate._locale.getpair(locale, encoding) + icu_locale = "root" if locale == "C" else locale + self._collator = _icu.Collator(icu_locale) + self.locale = self._collator.locale + self.encoding = collate._locale.encoding(encoding) + if self._collator.used_default_information and locale != "C": + raise collate.errors.InvalidLocaleError(locale) + + try: + self._breaker = _icu.WordBreaker(icu_locale) + except ValueError: + # Thai is the only language with a special break locale, + # so this is a harmless error. 
+ self._breaker = _icu.WordBreaker("root") + + def words(self, string): + """Split the string along word boundaries.""" + string = self.unicode(string) + return self._breaker.words(string) def key(self, string): - return self._collator.key(string) + """Sort key for a string. + + If the string is a str instance, it is decoded to a unicode + instance according to the 'encoding' attribute of the + Collator. + """ + string = self.unicode(string) + return self._collator.key(string) - def cmp(self, string1, string2): - return self._collator.cmp(string1, string2)