collate/syslocale.py

   1 """C library locale-based collation.
   2
   3 This collation backend uses the system's C library to sort strings. It
   4 is fast and almost always available, but may sort strings outside of
   5 the user's native locale incorrectly or confusingly (for example,
   6 en_US tends to ignore hiragana characters; ja_JP does not case-fold
   7 Latin characters).
   8
   9 Since the C library only supports one locale active at a time per
  10 process, instantiating a Collator from this module will affect the
  11 locale of all previous collators and anything else using the system
  12 locale information.
  13
  14 Use this collation backend if...
  15  - You are on a system without ICU.
  16
  17 Avoid this backend if...
  18  - ICU is available for the current locale.
  19  - You are sorting strings from alphabets outside the primary locale.
  20  - You need to support collating multiple locales at once.
  21  - You need the same results across multiple platforms.
  22
  23 """
  24
  25 __all__ = ["Collator"]
  26
  27 import locale
  28 import re
  29
  30 import collate.errors
  31 import collate._abcollator
  32 import collate._locale
  33
  34 class Collator(collate._abcollator.Collator):
  35     """C library locale-based collation."""
  36
  37     def __init__(self, locale_code, encoding=None):
  38         super(Collator, self).__init__(locale, encoding)
  39         locale_code, encoding = collate._locale.getpair(locale_code, encoding)
  40         try:
  41             setlocale = locale_code + "." + encoding
  42             locale.setlocale(locale.LC_COLLATE, setlocale)
  43         except locale.Error:
  44             raise collate.errors.InvalidLocaleError(setlocale)
  45         self.locale = locale.getlocale(locale.LC_COLLATE)[0]
  46         self.encoding = collate._locale.encoding(encoding)
  47
  48     def key(self, string):
  49         """Sort key for a string.
  50
  51         If string is a unicode instance that cannot be processed by
  52         the system locale library, it is first encoded according to
  53         the 'encoding' attribute of the Collator.
  54         """
  55         try:
  56             return locale.strxfrm(string)
  57         except UnicodeEncodeError:
  58             return locale.strxfrm(string.encode(self.encoding, "replace"))
  59
  60     def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
  61         """Split the string into separate words."""
  62         if isinstance(string, str):
  63             string = string.decode(self.encoding, 'replace')
  64         return re.split(sep, string)
  65