syslocale: setlocale for LC_COLLATE requires encoding to be considered valid.
[python-collate.git] / collate / syslocale.py
1 """C library locale-based collation.
2
3 This collation backend uses the system's C library to sort strings. It
4 is fast and almost always available, but may sort strings outside of
5 the user's native locale incorrectly or confusingly (for example,
6 en_US tends to ignore hiragana characters; ja_JP does not case-fold
7 Latin characters).
8
9 Since the C library only supports one locale active at a time per
10 process, instantiating a Collator from this module will affect the
11 locale of all previous collators and anything else using the system
12 locale information.
13
14 Use this collation backend if...
15 - You are on a system without ICU or UCA datafiles for the locale,
16 and DUCET results are not acceptable.
17
18 Avoid this backend if...
19 - ICU or UCA support is available for the current locale.
20 - You are sorting strings from alphabets outside the primary locale.
21 - You need to support collating multiple locales at once.
22 - You need the same results across multiple platforms.
23
24 """
25
26 import locale
27
28 import collate.errors
29 import collate._abcollator
30 import collate._locale
31
class Collator(collate._abcollator.Collator):
    """C library locale-based collation.

    Creating an instance calls locale.setlocale() for LC_COLLATE, which
    changes the collation locale for the whole process, not just for
    this object.

    Attributes set by __init__:
      locale   -- first element of locale.getlocale(LC_COLLATE) after
                  the switch.
      encoding -- codec name used to convert between byte strings and
                  text in key() and cmp().
    """

    def __init__(self, locale_code, encoding=None):
        """Activate the given locale for LC_COLLATE.

        locale_code and encoding are first resolved through
        collate._locale.getpair(); the pair is then joined as
        "<locale_code>.<encoding>" and handed to locale.setlocale().

        Raises collate.errors.InvalidLocaleError (carrying the joined
        name) if the C library rejects it.
        """
        locale_code, encoding = collate._locale.getpair(locale_code, encoding)
        # The encoding is always appended to the name passed to
        # setlocale; the name is built outside the try block so that
        # only the setlocale call itself is guarded.
        locale_name = locale_code + "." + encoding
        try:
            locale.setlocale(locale.LC_COLLATE, locale_name)
        except locale.Error:
            raise collate.errors.InvalidLocaleError(locale_name)
        self.locale = locale.getlocale(locale.LC_COLLATE)[0]
        self.encoding = collate._locale.encoding(encoding)

    def key(self, string):
        """Sort key for a string.

        The string is passed to locale.strxfrm() as-is when possible.
        If that raises UnicodeEncodeError, the string is first encoded
        according to the 'encoding' attribute of the Collator (with
        unencodable characters replaced) and transformed again.

        On interpreters where str is a text type distinct from bytes,
        byte strings are decoded with the 'encoding' attribute first,
        mirroring what cmp() does.
        """
        if str is not bytes and isinstance(string, bytes):
            # Only reached where str and bytes are different types;
            # where bytes is an alias of str the input is untouched.
            string = string.decode(self.encoding, "replace")
        try:
            return locale.strxfrm(string)
        except UnicodeEncodeError:
            return locale.strxfrm(string.encode(self.encoding, "replace"))

    def cmp(self, a, b):
        """Return negative if a < b, zero if a == b, positive if a > b.

        If byte strings rather than text strings are passed in, they
        are first decoded according to the 'encoding' attribute of the
        Collator (undecodable bytes are replaced).
        """
        # Test against bytes rather than str: where bytes is an alias
        # of str this is the same check, and where str is the text type
        # it avoids calling the non-existent str.decode().
        if isinstance(a, bytes):
            a = a.decode(self.encoding, "replace")
        if isinstance(b, bytes):
            b = b.decode(self.encoding, "replace")
        return locale.strcoll(a, b)
68