Cleanup in preparation for release. Add docstrings, remove basically empty _constants...
[python-collate.git] / collate / icu / __init__.py
1 """ICU-based collation.
2
3 This collation backend uses the International Components for Unicode
4 library to provide accurate and high-performance collation. It
5 supports multiple locales and advanced sorting capabilities.
6
7 Use this collation backend if possible; it's by far the best.
8
9 Avoid this backend if...
10 - ICU is not available for your system.
11
12 """
13
14 __all__ = ["Collator"]
15
16 import collate._abcollator
17 import collate._locale
18 import collate.errors
19
20 from collate.icu import _icu
21
class Collator(collate._abcollator.Collator):
    """ICU-based collation.

    Wraps the ``_icu`` extension's ``Collator`` (used for sort keys)
    and ``WordBreaker`` (used for word splitting) for one locale.

    Byte strings (``str`` instances) given to ``words`` or ``key`` are
    decoded using the ``encoding`` attribute; bytes that cannot be
    decoded are replaced instead of raising UnicodeDecodeError.
    """

    def __init__(self, locale, encoding=None):
        """Create a collator for a locale.

        locale -- requested locale name. "C" is mapped to the ICU
            locale "root".
        encoding -- encoding used to decode byte strings. It is
            normalized through collate._locale before being stored.

        After construction, the 'locale' attribute holds the locale
        reported by the underlying ICU collator, which may differ from
        the name that was requested.

        Raises collate.errors.InvalidLocaleError when the ICU collator
        reports that it used default information for a locale other
        than "C".
        """
        super(Collator, self).__init__(locale, encoding)
        locale, encoding = collate._locale.getpair(locale, encoding)
        icu_locale = "root" if locale == "C" else locale
        self._collator = _icu.Collator(icu_locale)
        self.locale = self._collator.locale
        self.encoding = collate._locale.encoding(encoding)
        # "C" is deliberately served by the root collator, so falling
        # back to default data is only an error for other locales.
        if self._collator.used_default_information and locale != "C":
            raise collate.errors.InvalidLocaleError(locale)

        try:
            self._breaker = _icu.WordBreaker(icu_locale)
        except ValueError:
            # Thai is the only language with a special break locale,
            # so this is a harmless error; use the root breaker.
            self._breaker = _icu.WordBreaker("root")

    def words(self, string):
        """Split the string along word boundaries.

        If the string is a str instance, it is decoded to a unicode
        instance according to the 'encoding' attribute of the
        Collator; undecodable bytes are replaced, matching key().

        Returns a list of the pieces produced by the word breaker,
        with whitespace-only pieces removed.
        """
        if isinstance(string, str):
            # Decode leniently, exactly as key() does, so any string
            # that can be sorted can also be split into words.
            string = string.decode(self.encoding, 'replace')
        pieces = self._breaker.words(string)
        return [piece for piece in pieces if not piece.isspace()]

    def key(self, string):
        """Sort key for a string.

        If the string is a str instance, it is decoded to a unicode
        instance according to the 'encoding' attribute of the
        Collator; undecodable bytes are replaced.
        """
        if isinstance(string, str):
            string = string.decode(self.encoding, 'replace')
        return self._collator.key(string)
59