X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2F_abcollator.py;h=dd7ea1430445548736e8f3d754ab9325210ac117;hp=12575eaed844603f039009a62262fe66b8d29ace;hb=308778ae560a3258a55d578b1dd52d030ce4399d;hpb=92fc0878bc7b75741a3434d17310e390a9304e70

diff --git a/collate/_abcollator.py b/collate/_abcollator.py
index 12575ea..dd7ea14 100644
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -1,33 +1,93 @@
+"""Abstract base collator."""
+
+import re
+
 import collate.strings
 
 class Collator(object):
+    """Abstract base class for Collators.
+
+    Attributes:
+    locale - the collator follows rules for this locale
+    encoding - assumed string encoding
+    """
+
+    locale = "C"
     encoding = "ascii"
 
+    def __init__(self, locale=None, encoding=None):
+        pass
+
     def cmp(self, string1, string2):
         """Return negative if a < b, zero if a == b, positive if a > b."""
         return cmp(self.key(string1), self.key(string2))
 
     def key(self, string):
-        return string
+        """Return a good sorting key for the string.
+        
+        The sort key should be considered an opaque value which is
+        only meaningful when compared to other sort keys from the same
+        collator.
+        """
+        return self.unicode(string)
 
-    def words(self, string):
-        """Split the string along word boundries."""
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-        return string.split()
+    def words(self, string, sep=re.compile(r"(\s+)", re.UNICODE)):
+        """Split the string into separate words."""
+        return re.split(sep, self.unicode(string))
 
     def sortemekey(self, string):
         """Return a key based on sortemes of a string.
 
-        If the string is a str instance, it is decoded to a unicode
-        instance according to the 'encoding' attribute of the
-        Collator.
+        A sorteme, by analogy with grapheme/morpheme/etc. is an atom
+        of sort information. This is larger than a word boundry but
+        smaller than a sentence boundry; roughly, a sorteme boundry
+        occurs between letters and numbers, between numbers and
+        numbers if 'too much' punctuation exists in between, between
+        lines.
         """
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-
+        string = self.unicode(string)
         # Shove the sortkeyed original string on the end to resolve
         # ties intelligently.
         return (collate.strings.sortemes(string, self.key),
                 self.key(string))
 
+    def unicode(self, string):
+        """Convert a str to a unicode using the collator encoding."""
+        try:
+            return unicode(string)
+        except UnicodeError:
+            return string.decode(self.encoding, 'replace')
+
+    def str(self, string):
+        """Convert a unicode to a str using the collator encoding."""
+        try:
+            return str(string)
+        except UnicodeError:
+            return string.encode(self.encoding, 'replace')
+
+    def lstripwords(
+        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+        """Strip words and whitespace from the start of a string.
+
+        If append is not empty, it and the words stripped from the
+        front are appended to the end.
+        """
+        string = self.unicode(string)
+        stripped = []
+        words = self.words(string)
+        while words and (words[0].isspace() or words[0].lower() in strip):
+            stripped.append(words.pop(0))
+        while stripped and stripped[-1].isspace():
+            stripped.pop()
+        if append and stripped:
+            if words:
+                words.append(append)
+            words.extend(stripped)
+        return u"".join(words)
+
+    def lstripsortemekey(
+        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+        """Return a key based on sortemes of a prefix-stripped string."""
+        string = self.unicode(string)
+        stripped = self.lstripwords(string, strip, append)
+        return (self.sortemekey(stripped), self.key(string))