"""C library locale-based collation.

This collation backend uses the system's C library to sort strings. It
is fast and almost always available, but may sort strings outside of
the user's native locale incorrectly or confusingly (for example,
en_US tends to ignore hiragana characters; ja_JP does not case-fold
Latin characters).

Since the C library only supports one locale active at a time per
process, instantiating a Collator from this module will affect the
locale of all previous collators and anything else using the system
locale information.

Use this collation backend if...
 - You are on a system without ICU.

Avoid this backend if...
 - ICU is available for the current locale.
 - You are sorting strings from alphabets outside the primary locale.
 - You need to support collating multiple locales at once.
 - You need the same results across multiple platforms.
"""
29 import collate
._abcollator
30 import collate
._locale
class Collator(collate._abcollator.Collator):
    """C library locale-based collation.

    Constructing an instance calls ``locale.setlocale`` for
    ``LC_COLLATE``, so it changes the collation locale used by the
    whole process, not just by this object.

    Attributes:
        locale: The locale name reported by
            ``locale.getlocale(locale.LC_COLLATE)`` after the locale
            has been set.
        encoding: The value returned by ``collate._locale.encoding``;
            used as the codec name when converting between byte
            strings and text in ``key`` and ``cmp``.
    """

    def __init__(self, locale_code, encoding=None):
        """Activate the requested locale for collation.

        Args:
            locale_code: Locale identifier, normalized together with
                ``encoding`` by ``collate._locale.getpair``.
            encoding: Optional encoding name; ``getpair`` supplies the
                value actually used.

        Raises:
            collate.errors.InvalidLocaleError: If the C library
                rejects the ``"<locale>.<encoding>"`` name.
        """
        locale_code, encoding = collate._locale.getpair(locale_code, encoding)
        # Build the name outside the try block so that only the
        # setlocale call itself is guarded.
        locale_name = locale_code + "." + encoding
        try:
            locale.setlocale(locale.LC_COLLATE, locale_name)
        except locale.Error:
            raise collate.errors.InvalidLocaleError(locale_name)
        # Record what the C library actually selected, which may be a
        # normalized form of the name that was requested.
        self.locale = locale.getlocale(locale.LC_COLLATE)[0]
        self.encoding = collate._locale.encoding(encoding)

    def key(self, string):
        """Sort key for a string.

        The string is handed to ``locale.strxfrm`` as-is when possible.
        If it is a text string that the system locale library cannot
        process, it is first encoded according to the 'encoding'
        attribute of the Collator. If it is a byte string that
        ``strxfrm`` refuses (as on Python 3), it is first decoded with
        the same encoding.

        Args:
            string: The text or byte string to build a key for.

        Returns:
            The transformed string produced by ``locale.strxfrm``.
        """
        try:
            return locale.strxfrm(string)
        except UnicodeEncodeError:
            return locale.strxfrm(string.encode(self.encoding, "replace"))
        except TypeError:
            # Python 3's strxfrm accepts text only; anything that is
            # not a byte string is a genuine caller error.
            if not isinstance(string, bytes):
                raise
            return locale.strxfrm(string.decode(self.encoding, "replace"))

    def cmp(self, a, b):
        """Return negative if a < b, zero if a == b, positive if a > b.

        If byte strings rather than text strings are passed in, they
        are first decoded according to the 'encoding' attribute of the
        Collator.

        Args:
            a: First string to compare.
            b: Second string to compare.

        Returns:
            The integer result of ``locale.strcoll(a, b)``.
        """
        # ``bytes`` is an alias of ``str`` on Python 2, so this check
        # matches the same objects there, while on Python 3 it leaves
        # text strings (which have no ``decode`` method) untouched.
        if isinstance(a, bytes):
            a = a.decode(self.encoding, "replace")
        if isinstance(b, bytes):
            b = b.decode(self.encoding, "replace")
        return locale.strcoll(a, b)

    def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
        """Split the string into separate words.

        The split is performed with ``re.split`` on ``sep``. The
        default pattern treats every run of characters that are not
        Unicode word characters as a word boundary; it does not
        consult the active locale.

        Args:
            string: The string to split.
            sep: Pattern (compiled or not) marking the gaps between
                words.

        Returns:
            A list of the pieces of ``string`` between matches of
            ``sep``; empty strings appear where ``string`` begins or
            ends with a separator.
        """
        return re.split(sep, string)