5b8adcac87a22a5db3d815289d8d1880a7284163
[python-collate.git] / collate / syslocale.py
1 """C library locale-based collation.
2
3 This collation backend uses the system's C library to sort strings. It
4 is fast and almost always available, but may sort strings outside of
5 the user's native locale incorrectly or confusingly (for example,
6 en_US tends to ignore hiragana characters; ja_JP does not case-fold
7 Latin characters).
8
9 Since the C library only supports one locale active at a time per
10 process, instantiating a Collator from this module will affect the
11 locale of all previous collators and anything else using the system
12 locale information.
13
14 Use this collation backend if...
15 - You are on a system without ICU.
16
17 Avoid this backend if...
18 - ICU is available for the current locale.
19 - You are sorting strings from alphabets outside the primary locale.
20 - You need to support collating multiple locales at once.
21 - You need the same results across multiple platforms.
22
23 """
24
25 __all__ = ["Collator"]
26
27 import locale
28 import re
29
30 import collate.errors
31 import collate._abcollator
32 import collate._locale
33
class Collator(collate._abcollator.Collator):
    """C library locale-based collation.

    Creating an instance calls locale.setlocale() for LC_COLLATE, which
    changes process-wide state: every other user of the C library's
    collation locale (including earlier Collator instances) is affected.
    """

    def __init__(self, locale_code, encoding=None):
        """Activate the requested collation locale.

        locale_code -- locale identifier; it is resolved together with
                       'encoding' through collate._locale.getpair().
        encoding    -- optional encoding name; also resolved by
                       collate._locale.getpair().

        Raises collate.errors.InvalidLocaleError if the C library
        rejects the resulting "<locale>.<encoding>" name.
        """
        # Hand the caller's locale code to the base class; the bare name
        # 'locale' here would be the imported stdlib module.
        super(Collator, self).__init__(locale_code, encoding)
        locale_code, encoding = collate._locale.getpair(locale_code, encoding)
        locale_name = locale_code + "." + encoding
        # Only the setlocale() call itself is guarded, so unrelated
        # failures are not misreported as an invalid locale.
        try:
            locale.setlocale(locale.LC_COLLATE, locale_name)
        except locale.Error:
            raise collate.errors.InvalidLocaleError(locale_name)
        # Record what the C library actually selected.
        self.locale = locale.getlocale(locale.LC_COLLATE)[0]
        self.encoding = collate._locale.encoding(encoding)

    def key(self, string):
        """Sort key for a string.

        If string is a unicode instance that cannot be processed by
        the system locale library, it is first encoded according to
        the 'encoding' attribute of the Collator; characters that
        cannot be encoded are replaced rather than raising.
        """
        try:
            return locale.strxfrm(string)
        except UnicodeEncodeError:
            return locale.strxfrm(string.encode(self.encoding, "replace"))

    def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
        """Split the string into separate words.

        Byte strings are first decoded using the Collator's 'encoding'
        attribute (undecodable bytes are replaced). 'sep' is the
        pattern the string is split on; by default runs of non-word
        characters.
        """
        # 'bytes' is the same type as 'str' on Python 2.6+, so this
        # matches the original check there, and it also works on
        # Python 3 where text strings have no decode() method.
        if isinstance(string, bytes):
            string = string.decode(self.encoding, 'replace')
        return re.split(sep, string)
65