"""ICU-based collation.

This collation backend uses the International Components for Unicode
library to provide accurate and high-performance collation. It
supports multiple locales and advanced sorting capabilities.

Use this collation backend if possible; it's by far the best.

Avoid this backend if...
 - ICU is not available for your system.
"""

14 import collate
._abcollator
15 import collate
._locale
18 from collate
.icu
import _icu
class Collator(collate._abcollator.Collator):
    """ICU-based collation.

    Wraps an ``_icu.Collator`` (sort keys and comparison) and an
    ``_icu.WordBreaker`` (word segmentation) built for one locale.
    Byte strings passed to ``words``, ``key`` or ``cmp`` are decoded
    with the ``encoding`` attribute first; undecodable bytes are
    replaced rather than raising.
    """

    def __init__(self, locale, encoding=None):
        """Create a collator for the given locale.

        locale -- locale name; "C" is mapped to ICU's "root" locale.
        encoding -- encoding used to decode byte strings.  Both values
            are first passed through collate._locale.getpair.

        Raises collate.errors.InvalidLocaleError when the ICU collator
        reports ``used_default_information`` for a locale other than
        "C".
        """
        locale, encoding = collate._locale.getpair(locale, encoding)
        icu_locale = "root" if locale == "C" else locale
        self._collator = _icu.Collator(icu_locale)
        self.locale = self._collator.locale
        self.encoding = collate._locale.encoding(encoding)
        if self._collator.used_default_information and locale != "C":
            raise collate.errors.InvalidLocaleError(locale)

        try:
            self._breaker = _icu.WordBreaker(icu_locale)
        except Exception:
            # Thai is the only language with a special break locale,
            # so this is a harmless error.
            # NOTE(review): the exception type WordBreaker raises is not
            # visible from this file -- narrow this clause once confirmed.
            self._breaker = _icu.WordBreaker("root")

    def _decode(self, string):
        """Return string as text, decoding byte strings with self.encoding."""
        # ``bytes`` is the same type as ``str`` on Python 2.6+, so this
        # is the check the public methods have always made there; on
        # Python 3 it lets text through instead of calling .decode on it.
        if isinstance(string, bytes):
            return string.decode(self.encoding, 'replace')
        return string

    def words(self, string):
        """Split a string into words, dropping whitespace-only segments.

        If the string is a byte string, it is first decoded according
        to the 'encoding' attribute of the Collator.
        """
        segments = self._breaker.words(self._decode(string))
        return [word for word in segments if not word.isspace()]

    def key(self, string):
        """Sort key for a string.

        If the string is a str instance, it is decoded to a unicode
        instance according to the 'encoding' attribute of the
        Collator.
        """
        return self._collator.key(self._decode(string))

    def cmp(self, a, b):
        """Return negative if a < b, zero if a == b, positive if a > b.

        If strs rather than unicodes are passed in, they are first
        decoded according to the 'encoding' attribute of the Collator.
        """
        # Each operand is decoded from itself; the previous code decoded
        # ``a`` a second time and stored the result in ``b``.
        return self._collator.cmp(self._decode(a), self._decode(b))