X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2F_abcollator.py;fp=collate%2F_abcollator.py;h=dd7ea1430445548736e8f3d754ab9325210ac117;hp=622766d3b5056c27ca9b9a98ce4609961dd11c1d;hb=308778ae560a3258a55d578b1dd52d030ce4399d;hpb=f854c6958fd98bc3a1709e7aa1ede1ce7f5ab612

diff --git a/collate/_abcollator.py b/collate/_abcollator.py
index 622766d..dd7ea14 100644
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -1,5 +1,7 @@
 """Abstract base collator."""
 
+import re
+
 import collate.strings
 
 class Collator(object):
@@ -27,15 +29,11 @@ class Collator(object):
         only meaningful when compared to other sort keys from the same
         collator.
         """
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-        return string
+        return self.unicode(string)
 
-    def words(self, string):
-        """Split the string along word boundries."""
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-        return string.split()
+    def words(self, string, sep=re.compile(r"(\s+)", re.UNICODE)):
+        """Split into words; by default whitespace runs are kept as items."""
+        return re.split(sep, self.unicode(string))
 
     def sortemekey(self, string):
         """Return a key based on sortemes of a string.
@@ -47,11 +45,49 @@ class Collator(object):
         numbers if 'too much' punctuation exists in between, between
         lines.
         """
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-
+        string = self.unicode(string)
         # Shove the sortkeyed original string on the end to resolve
         # ties intelligently.
         return (collate.strings.sortemes(string, self.key),
                 self.key(string))
 
+    def unicode(self, string):
+        """Return string as unicode, decoding with self.encoding on failure."""
+        try:
+            return unicode(string)
+        except UnicodeError:
+            return string.decode(self.encoding, 'replace')
+
+    def str(self, string):
+        """Return string as a str, encoding with self.encoding on failure."""
+        try:
+            return str(string)
+        except UnicodeError:
+            return string.encode(self.encoding, 'replace')
+
+    def lstripwords(
+        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+        """Strip words and whitespace from the start of a string.
+
+        If append is not empty, it and the words stripped from the
+        front are appended to the end.
+        """
+        # Drop leading whitespace and the empty edge items re.split emits.
+        words = [w for w in self.words(self.unicode(string).lstrip()) if w]
+        stripped = []
+        while words and (words[0].isspace() or words[0].lower() in strip):
+            stripped.append(words.pop(0))
+        while stripped and stripped[-1].isspace():
+            stripped.pop()
+        if append and stripped:
+            if words:
+                words.append(append)
+            words.extend(stripped)
+        return u"".join(words)
+
+    def lstripsortemekey(
+        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+        """Return (sortemekey of the stripped string, key of the original)."""
+        string = self.unicode(string)
+        stripped = self.lstripwords(string, strip, append)
+        return (self.sortemekey(stripped), self.key(string))