"""ICU-based collation.

This collation backend uses the International Components for Unicode
library to provide accurate and high-performance collation. It
supports multiple locales and advanced sorting capabilities.

Use this collation backend if possible; it's by far the best.

Avoid this backend if...
 - ICU is not available for your system.
"""
# Public API of this module: only the ICU-backed Collator class.
__all__ = ["Collator"]
16 import collate
._abcollator
17 import collate
._locale
20 from collate
.icu
import _icu
class Collator(collate._abcollator.Collator):
    """ICU-based collation.

    Holds an ``_icu.Collator`` (used for sort keys) and an
    ``_icu.WordBreaker`` (used for word splitting) for one locale.
    """

    def __init__(self, locale, encoding=None):
        """Set up the ICU collator and word breaker for a locale.

        Args:
            locale: Requested locale name. The value "C" is mapped to
                the ICU locale "root".
            encoding: Optional encoding; it is resolved together with
                the locale through ``collate._locale.getpair``.

        Raises:
            collate.errors.InvalidLocaleError: If the underlying
                collator reports ``used_default_information`` for any
                locale other than "C".
        """
        super(Collator, self).__init__(locale, encoding)
        locale, encoding = collate._locale.getpair(locale, encoding)
        icu_locale = "root" if locale == "C" else locale
        self._collator = _icu.Collator(icu_locale)
        # Expose the locale as reported back by the collator, not the
        # name that was requested.
        self.locale = self._collator.locale
        self.encoding = collate._locale.encoding(encoding)
        if self._collator.used_default_information and locale != "C":
            raise collate.errors.InvalidLocaleError(locale)

        # Prefer a locale-specific word breaker and fall back to the
        # root breaker only when that fails; assigning the root breaker
        # unconditionally would discard the locale-specific one.
        try:
            self._breaker = _icu.WordBreaker(icu_locale)
        except Exception:
            # Thai is the only language with a special break locale,
            # so this is a harmless error.
            # NOTE(review): the exception type raised by
            # _icu.WordBreaker is not visible from this file; narrow
            # this clause once it is confirmed.
            self._breaker = _icu.WordBreaker("root")

    def words(self, string):
        """Split the string along word boundaries.

        The string is first passed through ``self.unicode`` and then
        handed to the word breaker.
        """
        string = self.unicode(string)
        return self._breaker.words(string)

    def key(self, string):
        """Sort key for a string.

        If the string is a str instance, it is decoded to a unicode
        instance according to the 'encoding' attribute of the
        collator.
        """
        string = self.unicode(string)
        return self._collator.key(string)