X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2Ficu%2F__init__.py;h=6d27a6fa95cf2b2690d1a2f50174e4819bc27dc1;hp=16ef1e6f044e18574f56234bf8557a3b66978d26;hb=308778ae560a3258a55d578b1dd52d030ce4399d;hpb=c02c5c3c54d35e7d5836adf54aadac1f79906f05 diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py index 16ef1e6..6d27a6f 100644 --- a/collate/icu/__init__.py +++ b/collate/icu/__init__.py @@ -1,12 +1,56 @@ -import collate.icu._ucol +"""ICU-based collation. + +This collation backend uses the International Components for Unicode +library to provide accurate and high-performance collation. It +supports multiple locales and advanced sorting capabilities. + +Use this collation backend if possible; it's by far the best. + +Avoid this backend if... + - ICU is not available for your system. + +""" + +__all__ = ["Collator"] + import collate._abcollator +import collate._locale +import collate.errors + +from collate.icu import _icu class Collator(collate._abcollator.Collator): - def __init__(self, locale): - self._collator = collate.icu._ucol.Collator(locale) + """ICU-based collation.""" + + def __init__(self, locale, encoding=None): + super(Collator, self).__init__(locale, encoding) + locale, encoding = collate._locale.getpair(locale, encoding) + icu_locale = "root" if locale == "C" else locale + self._collator = _icu.Collator(icu_locale) + self.locale = self._collator.locale + self.encoding = collate._locale.encoding(encoding) + if self._collator.used_default_information and locale != "C": + raise collate.errors.InvalidLocaleError(locale) + + try: + self._breaker = _icu.WordBreaker(icu_locale) + except ValueError: + # Thai is the only language with a special break locale, + # so this is a harmless error. 
+ self._breaker = _icu.WordBreaker("root") + + def words(self, string): + """Split the string along word boundaries.""" + string = self.unicode(string) + return self._breaker.words(string) def key(self, string): - return self._collator.key(string) + """Sort key for a string. + + If the string is a str instance, it is decoded to a unicode + instance according to the 'encoding' attribute of the + Collator. + """ + string = self.unicode(string) + return self._collator.key(string) - def cmp(self, string1, string2): - return self._collator.cmp(string1, string2)