X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2Ficu%2F__init__.py;h=5f3ec05f98f19d051c4c8a6beda8b14fc0968081;hp=eeee418070483d0797b63b639d1aa7bca219442e;hb=9a7cf6459c40d53b58634f2df56386bf52c12f7c;hpb=dd764628164dd6f23c1ec98a13184ce7c69fa40f diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py index eeee418..5f3ec05 100644 --- a/collate/icu/__init__.py +++ b/collate/icu/__init__.py @@ -11,22 +11,38 @@ Avoid this backend if... """ -import collate.icu._ucol import collate._abcollator import collate._locale import collate.errors +from collate.icu import _icu + class Collator(collate._abcollator.Collator): """ICU-based collation.""" def __init__(self, locale, encoding=None): locale, encoding = collate._locale.getpair(locale, encoding) - self._collator = collate.icu._ucol.Collator(locale) + icu_locale = "root" if locale == "C" else locale + self._collator = _icu.Collator(icu_locale) self.locale = self._collator.locale self.encoding = collate._locale.encoding(encoding) if self._collator.used_default_information and locale != "C": raise collate.errors.InvalidLocaleError(locale) + try: + self._breaker = _icu.WordBreaker(icu_locale) + except ValueError: + # Thai is the only language with a special break locale, + # so this is a harmless error. + self._breaker = _icu.WordBreaker("root") + + def words(self, string): + """Split the string along word boundaries.""" + if isinstance(string, str): + string = string.decode(self.encoding) + words = self._breaker.words(string) + return [w for w in words if not w.isspace()] + def key(self, string): """Sort key for a string. @@ -47,5 +63,5 @@ class Collator(collate._abcollator.Collator): if isinstance(a, str): a = a.decode(self.encoding, 'replace') if isinstance(b, str): - b = a.decode(self.encoding, 'replace') + b = b.decode(self.encoding, 'replace') return self._collator.cmp(a, b)