'Advanced' sorteme functions.
[python-collate.git] / collate / syslocale.py
1 """C library locale-based collation.
2
3 This collation backend uses the system's C library to sort strings. It
4 is fast and almost always available, but may sort strings outside of
5 the user's native locale incorrectly or confusingly (for example,
6 en_US tends to ignore hiragana characters; ja_JP does not case-fold
7 Latin characters).
8
9 Since the C library only supports one locale active at a time per
10 process, instantiating a Collator from this module will affect the
11 locale of all previous collators and anything else using the system
12 locale information.
13
14 Use this collation backend if...
15 - You are on a system without ICU.
16
17 Avoid this backend if...
18 - ICU is available for the current locale.
19 - You are sorting strings from alphabets outside the primary locale.
20 - You need to support collating multiple locales at once.
21 - You need the same results across multiple platforms.
22
23 """
24
25 import locale
26 import re
27
28 import collate.errors
29 import collate._abcollator
30 import collate._locale
31
class Collator(collate._abcollator.Collator):
    """C library locale-based collation.

    Creating an instance calls ``locale.setlocale`` for ``LC_COLLATE``,
    so it changes the collation locale of the whole process, not just
    of this object.

    Attributes set by ``__init__``:
      locale   -- first element of ``locale.getlocale(LC_COLLATE)`` as
                  read back after the locale was set.
      encoding -- result of ``collate._locale.encoding(encoding)``; used
                  by ``key`` and ``cmp`` to convert between text and
                  byte strings.
    """

    def __init__(self, locale_code, encoding=None):
        """Activate the requested collation locale for this process.

        locale_code and encoding are first passed through
        ``collate._locale.getpair`` (presumably to fill in defaults --
        confirm against that helper), then joined as "code.encoding"
        and handed to ``locale.setlocale``.

        Raises collate.errors.InvalidLocaleError, carrying the
        attempted "code.encoding" name, if the C library rejects it.
        """
        locale_code, encoding = collate._locale.getpair(locale_code, encoding)
        locale_name = locale_code + "." + encoding
        # Only the setlocale call itself can raise locale.Error, so it
        # is the only statement guarded by the try.
        try:
            locale.setlocale(locale.LC_COLLATE, locale_name)
        except locale.Error:
            raise collate.errors.InvalidLocaleError(locale_name)
        # Read the name back from the C library rather than trusting
        # the requested one.
        self.locale = locale.getlocale(locale.LC_COLLATE)[0]
        self.encoding = collate._locale.encoding(encoding)

    def key(self, string):
        """Sort key for a string.

        Returns ``locale.strxfrm(string)``. If that call raises
        UnicodeEncodeError, the string is encoded using the 'encoding'
        attribute of the Collator (unencodable characters replaced)
        and the encoded form is transformed instead.
        """
        try:
            return locale.strxfrm(string)
        except UnicodeEncodeError:
            encoded = string.encode(self.encoding, "replace")
            return locale.strxfrm(encoded)

    def cmp(self, a, b):
        """Return negative if a < b, zero if a == b, positive if a > b.

        Byte strings are first decoded according to the 'encoding'
        attribute of the Collator (undecodable bytes replaced); text
        strings are compared as they are.
        """
        # `bytes` is an alias of `str` on Python 2.6+, so this is the
        # same check there; on Python 3 it selects the only type that
        # actually has a decode() method.
        if isinstance(a, bytes):
            a = a.decode(self.encoding, "replace")
        if isinstance(b, bytes):
            b = b.decode(self.encoding, "replace")
        return locale.strcoll(a, b)

    def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
        """Split the string into separate words.

        The string is split with ``re.split`` on sep. The default
        separator is any run of non-word characters as defined by the
        Unicode character database (``re.UNICODE``); it does not
        depend on the active locale. Pass a different pattern as sep
        to use another notion of a word boundary.
        """
        return re.split(sep, string)
75