collate/_abcollator.py

   1 """Abstract base collator."""
   2
   3 import re
   4
   5 import collate.strings
   6
   7 class Collator(object):
   8     """Abstract base class for Collators.
   9
  10     Attributes:
  11     locale - the collator follows rules for this locale
  12     encoding - assumed string encoding
  13     """
  14
  15     locale = "C"
  16     encoding = "ascii"
  17
  18     def __init__(self, locale=None, encoding=None):
  19         pass
  20
  21     def cmp(self, string1, string2):
  22         """Return negative if a < b, zero if a == b, positive if a > b."""
  23         return cmp(self.key(string1), self.key(string2))
  24
  25     def key(self, string):
  26         """Return a good sorting key for the string.
  27
  28         The sort key should be considered an opaque value which is
  29         only meaningful when compared to other sort keys from the same
  30         collator.
  31         """
  32         return self.unicode(string)
  33
  34     def words(self, string, sep=re.compile(r"(\s+)", re.UNICODE)):
  35         """Split the string into separate words."""
  36         return re.split(sep, self.unicode(string))
  37
  38     def sortemekey(self, string):
  39         """Return a key based on sortemes of a string.
  40
  41         A sorteme, by analogy with grapheme/morpheme/etc. is an atom
  42         of sort information. This is larger than a word boundry but
  43         smaller than a sentence boundry; roughly, a sorteme boundry
  44         occurs between letters and numbers, between numbers and
  45         numbers if 'too much' punctuation exists in between, between
  46         lines.
  47         """
  48         string = self.unicode(string)
  49         # Shove the sortkeyed original string on the end to resolve
  50         # ties intelligently.
  51         return (collate.strings.sortemes(string, self.key),
  52                 self.key(string))
  53
  54     def unicode(self, string):
  55         """Convert a str to a unicode using the collator encoding."""
  56         try:
  57             return unicode(string)
  58         except UnicodeError:
  59             return string.decode(self.encoding, 'replace')
  60
  61     def str(self, string):
  62         """Convert a unicode to a str using the collator encoding."""
  63         try:
  64             return str(string)
  65         except UnicodeError:
  66             return string.encode(self.encoding, 'replace')
  67
  68     def lstripwords(
  69         self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
  70         """Strip words and whitespace from the start of a string.
  71
  72         If append is not empty, it and the words stripped from the
  73         front are appended to the end.
  74         """
  75         string = self.unicode(string)
  76         stripped = []
  77         words = self.words(string)
  78         while words and (words[0].isspace() or words[0].lower() in strip):
  79             stripped.append(words.pop(0))
  80         while stripped and stripped[-1].isspace():
  81             stripped.pop()
  82         if append and stripped:
  83             if words:
  84                 words.append(append)
  85             words.extend(stripped)
  86         return u"".join(words)
  87
  88     def lstripsortemekey(
  89         self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
  90         """Return a key based on sortemes of a prefix-stripped string."""
  91         string = self.unicode(string)
  92         stripped = self.lstripwords(string, strip, append)
  93         return (self.sortemekey(stripped), self.key(string))