+"""Abstract base collator."""
+
+import re
+
+import collate.strings
+
class Collator(object):
    """Abstract base class for Collators.

    Attributes:
    locale - the collator follows rules for this locale
    encoding - assumed string encoding
    """

    locale = "C"
    encoding = "ascii"

    def __init__(self, locale=None, encoding=None):
        """Create a collator, optionally overriding the class defaults.

        An argument left as None keeps the class-level default for
        the corresponding attribute.
        """
        if locale is not None:
            self.locale = locale
        if encoding is not None:
            self.encoding = encoding

    def cmp(self, string1, string2):
        """Compare two strings by their sort keys.

        Return negative if string1 < string2, zero if they are equal,
        positive if string1 > string2.
        """
        return cmp(self.key(string1), self.key(string2))

    def key(self, string):
        """Return a good sorting key for the string.

        The sort key should be considered an opaque value which is
        only meaningful when compared to other sort keys from the same
        collator.
        """
        return self.unicode(string)

    def words(self, string, sep=re.compile(r"(\s+)", re.UNICODE)):
        """Split the string into separate words.

        The default separator has a capturing group, so the separators
        themselves are kept in the returned list, and a string that
        starts or ends with a separator yields an empty first or last
        element.
        """
        # re.split (rather than sep.split) accepts both a compiled
        # pattern and a plain pattern string from the caller.
        return re.split(sep, self.unicode(string))

    def sortemekey(self, string):
        """Return a key based on sortemes of a string.

        A sorteme, by analogy with grapheme/morpheme/etc. is an atom
        of sort information. This is larger than a word boundary but
        smaller than a sentence boundary; roughly, a sorteme boundary
        occurs between letters and numbers, between numbers and
        numbers if 'too much' punctuation exists in between, between
        lines.
        """
        string = self.unicode(string)
        # Shove the sortkeyed original string on the end to resolve
        # ties intelligently.
        return (collate.strings.sortemes(string, self.key),
                self.key(string))

    def unicode(self, string):
        """Convert a str to a unicode using the collator encoding.

        The default conversion is tried first; only if it raises
        UnicodeError is the value decoded with self.encoding, replacing
        undecodable bytes.
        """
        try:
            return unicode(string)
        except UnicodeError:
            return string.decode(self.encoding, 'replace')

    def str(self, string):
        """Convert a unicode to a str using the collator encoding.

        The default conversion is tried first; only if it raises
        UnicodeError is the value encoded with self.encoding, replacing
        unencodable characters.
        """
        try:
            return str(string)
        except UnicodeError:
            return string.encode(self.encoding, 'replace')

    def lstripwords(
            self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
        """Strip words and whitespace from the start of a string.

        Leading tokens are removed for as long as they are empty,
        whitespace, or (lowercased) members of strip.

        If append is not empty, it and the words stripped from the
        front are appended to the end. The separator is only inserted
        when something remains in front of it.
        """
        tokens = self.words(self.unicode(string))

        def is_blank(token):
            # re.split with a capturing separator produces u"" at the
            # edges when the string starts or ends with a separator;
            # u"".isspace() is False, so test for emptiness explicitly.
            return not token or token.isspace()

        # Find the first token that must be kept.
        cut = 0
        while cut < len(tokens) and (
                is_blank(tokens[cut]) or tokens[cut].lower() in strip):
            cut += 1

        # Trim blank tokens from both ends of the stripped run so only
        # the stop words (and the whitespace between them) are moved.
        first, last = 0, cut
        while first < last and is_blank(tokens[first]):
            first += 1
        while last > first and is_blank(tokens[last - 1]):
            last -= 1

        stripped = tokens[first:last]
        remaining = tokens[cut:]
        if append and stripped:
            if remaining:
                remaining.append(append)
            remaining.extend(stripped)
        return u"".join(remaining)

    def lstripsortemekey(
            self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
        """Return a key based on sortemes of a prefix-stripped string.

        The key of the original, unstripped string is added as a second
        element to break ties between strings that strip to the same
        text.
        """
        string = self.unicode(string)
        stripped = self.lstripwords(string, strip, append)
        return (self.sortemekey(stripped), self.key(string))