Collator.lstripwords: Strip words off the start and append to the end.

[python-collate.git] / collate / _abcollator.py
diff --git a/collate/_abcollator.py b/collate/_abcollator.py

index 622766d..dd7ea14 100644 (file)
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -1,5 +1,7 @@
  """Abstract base collator."""
  
+import re
+
  import collate.strings
  
  class Collator(object):
@@ -27,15 +29,11 @@ class Collator(object):
          only meaningful when compared to other sort keys from the same
          collator.
          """
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-        return string
+        return self.unicode(string)
  
-    def words(self, string):
-        """Split the string along word boundries."""
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-        return string.split()
+    def words(self, string, sep=re.compile(r"(\s+)", re.UNICODE)):
+        """Split the string into words and the separators between them.
+
+        The string is first converted with self.unicode().  Because the
+        default separator pattern is a capturing group, the matched
+        whitespace runs are included in the returned list, and text that
+        starts or ends with a separator yields an empty string at that
+        end of the list.
+        """
+        text = self.unicode(string)
+        return re.split(sep, text)
  
      def sortemekey(self, string):
          """Return a key based on sortemes of a string.
@@ -47,11 +45,49 @@ class Collator(object):
          numbers if 'too much' punctuation exists in between, between
          lines.
          """
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-
+        string = self.unicode(string)
          # Shove the sortkeyed original string on the end to resolve
          # ties intelligently.
          return (collate.strings.sortemes(string, self.key),
                  self.key(string))
  
+    def unicode(self, string):
+        """Return string as unicode text.
+
+        The builtin conversion is attempted first; if it raises a
+        UnicodeError the value is decoded with the collator encoding
+        (self.encoding) using the 'replace' error handler.
+        """
+        try:
+            # Inside the method body the bare name is the builtin, not
+            # this method.
+            text = unicode(string)
+        except UnicodeError:
+            text = string.decode(self.encoding, 'replace')
+        return text
+
+    def str(self, string):
+        """Return string as a byte str.
+
+        The builtin conversion is attempted first; if it raises a
+        UnicodeError the value is encoded with the collator encoding
+        (self.encoding) using the 'replace' error handler.
+        """
+        try:
+            # Inside the method body the bare name is the builtin, not
+            # this method.
+            data = str(string)
+        except UnicodeError:
+            data = string.encode(self.encoding, 'replace')
+        return data
+
+    def lstripwords(
+        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+        """Strip words and whitespace from the start of a string.
+
+        Leading tokens are removed for as long as they are empty,
+        whitespace, or (lower-cased) members of strip.  If append is
+        not empty, it and the words stripped from the front are
+        appended to the end; otherwise the stripped words are dropped.
+
+        The string is converted with self.unicode() first, and the
+        result is the remaining tokens joined back together.
+        """
+        tokens = self.words(self.unicode(string))
+        # self.words() keeps the whitespace separators and yields an
+        # empty token at the start (or end) when the text starts (or
+        # ends) with whitespace.  Empty tokens therefore have to be
+        # skipped like whitespace, or leading blanks would stop the
+        # stripping before it begins.
+        split = 0
+        while split < len(tokens):
+            token = tokens[split]
+            if token and not token.isspace() and token.lower() not in strip:
+                break
+            split += 1
+        stripped, kept = tokens[:split], tokens[split:]
+        # Only the blanks *between* stripped words are preserved; blank
+        # tokens at either end of the stripped prefix are discarded.
+        while stripped and not stripped[-1].strip():
+            stripped.pop()
+        while stripped and not stripped[0].strip():
+            del stripped[0]
+        if append and stripped:
+            # The separator is only needed when something precedes the
+            # moved words.
+            if kept:
+                kept.append(append)
+            kept.extend(stripped)
+        return u"".join(kept)
+
+    def lstripsortemekey(
+        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+        """Return a sort key built from the prefix-stripped string.
+
+        The result is a 2-tuple: the sortemekey() of the string after
+        lstripwords(string, strip, append), followed by key() of the
+        unmodified (unicode-converted) string.
+        """
+        original = self.unicode(string)
+        rotated = self.lstripwords(original, strip, append)
+        return (self.sortemekey(rotated), self.key(original))