From 308778ae560a3258a55d578b1dd52d030ce4399d Mon Sep 17 00:00:00 2001
From: Joe Wreschnig <joe.wreschnig@gmail.com>
Date: Thu, 25 Feb 2010 20:49:40 -0800
Subject: [PATCH] Collator.lstripwords: Strip words off the start and append to
 the end. Collator.unicode, Collator.str: Convenience recoding routines.

---
 NEWS.txt                |  2 ++
 collate/__init__.py     |  2 +-
 collate/_abcollator.py  | 58 +++++++++++++++++++++++++++++++++--------
 collate/icu/__init__.py |  9 +++----
 collate/strings.py      |  2 ++
 collate/syslocale.py    | 12 ++-------
 setup.py                |  2 +-
 7 files changed, 58 insertions(+), 29 deletions(-)

diff --git a/NEWS.txt b/NEWS.txt
index 823498d..c2cbbc0 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,6 +1,8 @@
 2010 - 0.2 - Dedicated to ２ (U+FF12)
  - ç´æï½ before ç´æï¼ï½.
  - Detect Unicode Roman numeral codepoints and sort them numerically.
+ - lstripwords and lstripsortemekey methods, e.g. sort "A Perfect Sky"
+   as "Perfect Sky, A".
 
 2010.02.22 - 0.1 - Dedicated to 勘 (U+52D8)
  - Initial release.
diff --git a/collate/__init__.py b/collate/__init__.py
index 5094071..ea1516a 100644
--- a/collate/__init__.py
+++ b/collate/__init__.py
@@ -52,7 +52,7 @@ try:
 except ImportError:
     pass
 
-VERSION = (0, 1)
+VERSION = (0, 2)
 VERSION_STRING = ".".join(map(str, VERSION))
 
 collator = None
diff --git a/collate/_abcollator.py b/collate/_abcollator.py
index 622766d..dd7ea14 100644
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -1,5 +1,7 @@
 """Abstract base collator."""
 
+import re
+
 import collate.strings
 
 class Collator(object):
@@ -27,15 +29,11 @@ class Collator(object):
         only meaningful when compared to other sort keys from the same
         collator.
         """
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-        return string
+        return self.unicode(string)
 
-    def words(self, string):
-        """Split the string along word boundries."""
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-        return string.split()
+    def words(self, string, sep=re.compile(r"(\s+)", re.UNICODE)):
+        """Split into words, keeping each whitespace run as an item."""
+        return re.split(sep, self.unicode(string))
 
     def sortemekey(self, string):
         """Return a key based on sortemes of a string.
@@ -47,11 +45,49 @@ class Collator(object):
         numbers if 'too much' punctuation exists in between, between
         lines.
         """
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-
+        string = self.unicode(string)
         # Shove the sortkeyed original string on the end to resolve
         # ties intelligently.
         return (collate.strings.sortemes(string, self.key),
                 self.key(string))
 
+    def unicode(self, string):
+        """Coerce to unicode; fall back to a lossy self.encoding decode."""
+        try:
+            return unicode(string)
+        except UnicodeError:
+            return string.decode(self.encoding, 'replace')
+
+    def str(self, string):
+        """Coerce to str; fall back to a lossy self.encoding encode."""
+        try:
+            return str(string)
+        except UnicodeError:
+            return string.encode(self.encoding, 'replace')
+
+    def lstripwords(
+        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+        """Strip leading whitespace and any leading words found in strip.
+
+        Words are lowercased before the lookup. If append is not empty, it
+        and the stripped words are appended to the end of the result.
+        """
+        # Drop empty tokens: re.split puts u"" ahead of leading whitespace,
+        # which used to stop the loop below before anything was stripped.
+        words = [w for w in self.words(self.unicode(string)) if w]
+        stripped = []
+        while words and (words[0].isspace() or words[0].lower() in strip):
+            stripped.append(words.pop(0))
+        prefix = u"".join(stripped).strip()
+        if append and prefix:
+            if words:
+                words.append(append)
+            words.append(prefix)
+        return u"".join(words)
+
+    def lstripsortemekey(
+        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+        """Key on the lstripwords form; key(string) breaks ties."""
+        string = self.unicode(string)
+        stripped = self.lstripwords(string, strip, append)
+        return (self.sortemekey(stripped), self.key(string))
diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py
index e7dfe2d..6d27a6f 100644
--- a/collate/icu/__init__.py
+++ b/collate/icu/__init__.py
@@ -41,10 +41,8 @@ class Collator(collate._abcollator.Collator):
 
     def words(self, string):
         """Split the string along word boundries."""
-        if isinstance(string, str):
-            string = string.decode(self.encoding)
-        words = self._breaker.words(string)
-        return [w for w in words if not w.isspace()]
+        string = self.unicode(string)
+        return self._breaker.words(string)  # unfiltered: whitespace stays
 
     def key(self, string):
         """Sort key for a string.
@@ -53,7 +51,6 @@ class Collator(collate._abcollator.Collator):
         instance according to the 'encoding' attribute of the
         Collator.
         """
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
+        string = self.unicode(string)
         return self._collator.key(string)
 
diff --git a/collate/strings.py b/collate/strings.py
index 487257f..16ba8be 100644
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -40,6 +40,8 @@ ROMAN = {
     u"\u2188": 100000,
     }
 
+INITIAL_STOPS = frozenset([u"a", u"an", u"the"])  # keep entries lowercase
+
 def stripends(word):
     """Strip punctuation and symbols from the ends of a string."""
     while word and unicodedata.category(word[0])[0] in "PS":
diff --git a/collate/syslocale.py b/collate/syslocale.py
index 5b8adca..e48ee82 100644
--- a/collate/syslocale.py
+++ b/collate/syslocale.py
@@ -25,7 +25,6 @@ Avoid this backend if...
 __all__ = ["Collator"]
 
 import locale
-import re
 
 import collate.errors
 import collate._abcollator
@@ -54,12 +53,5 @@ class Collator(collate._abcollator.Collator):
         """
         try:
             return locale.strxfrm(string)
-        except UnicodeEncodeError:
-            return locale.strxfrm(string.encode(self.encoding, "replace"))
-
-    def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
-        """Split the string into separate words."""
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-        return re.split(sep, string)
-
+        except UnicodeError:
+            return locale.strxfrm(self.str(string))  # encode, then retry
diff --git a/setup.py b/setup.py
index 99a13af..e47d620 100755
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@ else:
     libraries = ['icui18n', 'icuuc', 'icudata']
 
 setup(name='collate',
-      version='0.1',
+      version='0.2',
       author="Joe Wreschnig",
       author_email="joe.wreschnig@gmail.com",
       description="Python text collation",
-- 
2.20.1