5f3ec05f98f19d051c4c8a6beda8b14fc0968081
[python-collate.git] / collate / icu / __init__.py
1 """ICU-based collation.
2
3 This collation backend uses the International Components for Unicode
4 library to provide accurate and high-performance collation. It
5 supports multiple locales and advanced sorting capabilities.
6
7 Use this collation backend if possible; it's by far the best.
8
9 Avoid this backend if...
10 - ICU is not available for your system.
11
12 """
13
14 import collate._abcollator
15 import collate._locale
16 import collate.errors
17
18 from collate.icu import _icu
19
class Collator(collate._abcollator.Collator):
    """ICU-based collation.

    Wraps an ``_icu.Collator`` (sort keys and comparison) and an
    ``_icu.WordBreaker`` (word splitting) for a single locale.

    Attributes:
        locale: The locale as reported by the underlying ICU collator.
        encoding: Encoding used to decode byte strings passed to
            words(), key() and cmp().
    """

    def __init__(self, locale, encoding=None):
        """Create a collator for the given locale.

        The locale/encoding pair is normalized through
        collate._locale.getpair(); the "C" locale is mapped onto
        ICU's "root" locale.

        Raises collate.errors.InvalidLocaleError if the ICU collator
        reports 'used_default_information' for any locale other
        than "C".
        """
        locale, encoding = collate._locale.getpair(locale, encoding)
        icu_locale = "root" if locale == "C" else locale
        self._collator = _icu.Collator(icu_locale)
        self.locale = self._collator.locale
        self.encoding = collate._locale.encoding(encoding)
        # NOTE(review): presumably 'used_default_information' means ICU
        # had no data for the requested locale and fell back to its
        # defaults -- confirm against the _icu extension. For "C" the
        # root locale was requested on purpose, so it is not an error.
        if self._collator.used_default_information and locale != "C":
            raise collate.errors.InvalidLocaleError(locale)

        try:
            self._breaker = _icu.WordBreaker(icu_locale)
        except ValueError:
            # Thai is the only language with a special break locale,
            # so this is a harmless error.
            self._breaker = _icu.WordBreaker("root")

    def _decode(self, string):
        """Return string as text, decoding byte strings if needed.

        Byte strings are decoded using the 'encoding' attribute with
        the 'replace' error handler, so malformed input never raises.
        Anything else is returned unchanged.
        """
        # On Python 2 'bytes' is an alias of 'str', so this matches
        # exactly the inputs the previous isinstance(..., str) checks
        # matched; on Python 3 it leaves text strings alone.
        if isinstance(string, bytes):
            return string.decode(self.encoding, 'replace')
        return string

    def words(self, string):
        """Split the string along word boundaries.

        If the string is a byte string, it is first decoded according
        to the 'encoding' attribute of the Collator. Segments that
        consist only of whitespace are dropped from the result.
        """
        segments = self._breaker.words(self._decode(string))
        return [w for w in segments if not w.isspace()]

    def key(self, string):
        """Sort key for a string.

        If the string is a byte string, it is decoded to a unicode
        instance according to the 'encoding' attribute of the
        Collator.
        """
        return self._collator.key(self._decode(string))

    def cmp(self, a, b):
        """Return negative if a < b, zero if a == b, positive if a > b.

        If byte strings rather than unicodes are passed in, they are
        first decoded according to the 'encoding' attribute of the
        Collator.
        """
        return self._collator.cmp(self._decode(a), self._decode(b))