1 """C library locale-based collation.

This collation backend uses the system's C library to sort strings. It
is fast and almost always available, but may sort strings outside of
the user's native locale incorrectly or confusingly (for example,
en_US tends to ignore hiragana characters; ja_JP does not case-fold
Latin characters).

Since the C library only supports one locale active at a time per
process, instantiating a Collator from this module will affect the
locale of all previous collators and anything else using the system
locale information.

Use this collation backend if...
 - You are on a system without ICU.

Avoid this backend if...
 - ICU is available for the current locale.
 - You are sorting strings from alphabets outside the primary locale.
 - You need to support collating multiple locales at once.
 - You need the same results across multiple platforms.
"""
# Public API of this module: only the Collator class is exported.
__all__ = ["Collator"]
import locale
import re

import collate._abcollator
import collate._locale
import collate.errors
class Collator(collate._abcollator.Collator):
    """C library locale-based collation.

    Sort keys come from ``locale.strxfrm``, so they follow whatever
    LC_COLLATE locale is active in the process. Creating an instance
    calls ``locale.setlocale``, which changes that process-wide setting.
    """

    def __init__(self, locale_code, encoding=None):
        """Activate the requested locale for collation.

        locale_code and encoding are passed through
        collate._locale.getpair, and the returned pair is joined as
        "<code>.<encoding>" to form the name given to locale.setlocale.

        Raises collate.errors.InvalidLocaleError if locale.setlocale
        rejects that name.
        """
        # Hand the base class the requested code; the bare name 'locale'
        # in this module is the standard library module, not a locale.
        super(Collator, self).__init__(locale_code, encoding)
        locale_code, encoding = collate._locale.getpair(locale_code, encoding)
        locale_name = locale_code + "." + encoding
        # Only the setlocale call is guarded, so unrelated errors are
        # never reported as an invalid locale.
        try:
            locale.setlocale(locale.LC_COLLATE, locale_name)
        except locale.Error:
            raise collate.errors.InvalidLocaleError(locale_name)
        # Store the language code that getlocale reports for LC_COLLATE.
        self.locale = locale.getlocale(locale.LC_COLLATE)[0]
        self.encoding = collate._locale.encoding(encoding)

    def key(self, string):
        """Sort key for a string.

        If string is a unicode instance that cannot be processed by
        the system locale library, it is first encoded according to
        the 'encoding' attribute of the Collator (unencodable
        characters are replaced).

        Returns the result of locale.strxfrm.
        """
        try:
            return locale.strxfrm(string)
        except UnicodeEncodeError:
            return locale.strxfrm(string.encode(self.encoding, "replace"))

    def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
        """Split the string into separate words.

        Byte strings are first decoded using the 'encoding' attribute
        of the Collator, with undecodable bytes replaced. sep is the
        pattern to split on; by default, runs of non-word characters.

        Returns the list produced by re.split.
        """
        # 'bytes' is the same type as 'str' on Python 2, and on Python 3
        # it is the only string type that has a decode() method.
        if isinstance(string, bytes):
            string = string.decode(self.encoding, 'replace')
        return re.split(sep, string)