X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2Ficu%2F__init__.py;h=6f9647e4141c3238e456a0acdd40a4057da81638;hp=16ef1e6f044e18574f56234bf8557a3b66978d26;hb=2a37219e2d9c0fe58e78d987a21f6e37cfd33940;hpb=c02c5c3c54d35e7d5836adf54aadac1f79906f05

diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py
index 16ef1e6..6f9647e 100644
--- a/collate/icu/__init__.py
+++ b/collate/icu/__init__.py
@@ -1,12 +1,65 @@
-import collate.icu._ucol
+"""ICU-based collation.
+
+This collation backend uses the International Components for Unicode
+library to provide accurate and high-performance collation. It
+supports multiple locales and advanced sorting capabilities.
+
+Use this collation backend if possible; it's by far the best.
+
+Avoid this backend if...
+ - ICU is not available for your system.
+
+"""
+
 import collate._abcollator
+import collate._locale
+import collate.errors
+
+from collate.icu import _icu
 
 class Collator(collate._abcollator.Collator):
-    def __init__(self, locale):
-        self._collator = collate.icu._ucol.Collator(locale)
+    """ICU-based collation."""
+
+    def __init__(self, locale, encoding=None):
+        locale, encoding = collate._locale.getpair(locale, encoding)
+        icu_locale = "root" if locale == "C" else locale
+        self._collator = _icu.Collator(icu_locale)
+        self.locale = self._collator.locale
+        self.encoding = collate._locale.encoding(encoding)
+        if self._collator.used_default_information and locale != "C":
+            raise collate.errors.InvalidLocaleError(locale)
+
+        try:
+            self._breaker = _icu.WordBreaker(icu_locale)
+        except ValueError:
+            # Thai is the only language with a special break locale,
+            # so this is a harmless error.
+            self._breaker = _icu.WordBreaker("root")
+
+    def words(self, string):
+        if isinstance(string, str):
+            string = string.decode(self.encoding, 'replace')
+        return filter(lambda u: not u.isspace(), self._breaker.words(string))
 
     def key(self, string):
-        return self._collator.key(string)
+        """Sort key for a string.
+
+        If the string is a str instance, it is decoded to a unicode
+        instance according to the 'encoding' attribute of the
+        Collator.
+        """
+        if isinstance(string, str):
+            string = string.decode(self.encoding, 'replace')
+        return self._collator.key(string)
+
+    def cmp(self, a, b):
+        """Return negative if a < b, zero if a == b, positive if a > b.
 
-    def cmp(self, string1, string2):
-        return self._collator.cmp(string1, string2)
+        If strs rather than unicodes are passed in, they are first
+        decoded according to the 'encoding' attribute of the Collator.
+        """
+        if isinstance(a, str):
+            a = a.decode(self.encoding, 'replace')
+        if isinstance(b, str):
+            b = b.decode(self.encoding, 'replace')
+        return self._collator.cmp(a, b)