From 9a7cf6459c40d53b58634f2df56386bf52c12f7c Mon Sep 17 00:00:00 2001
From: Joe Wreschnig <joe.wreschnig@gmail.com>
Date: Fri, 19 Feb 2010 01:23:57 -0800
Subject: [PATCH] More tweaks; notably try to insert paragraph breaks rather
 than a separate Python tuple when re-concatenating strings.

---
 collate/_abcollator.py              | 20 +++++++++++++++++---
 collate/icu/__init__.py             |  7 +++++++
 collate/{_strings.py => strings.py} | 14 +++++++++-----
 3 files changed, 33 insertions(+), 8 deletions(-)
 rename collate/{_strings.py => strings.py} (95%)

diff --git a/collate/_abcollator.py b/collate/_abcollator.py
index fdd7783..0ae5d45 100644
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -1,14 +1,28 @@
-import collate._strings
+import collate.strings
 
 class Collator(object):
     def cmp(self, string1, string2):
         """Return negative if a < b, zero if a == b, positive if a > b."""
         return cmp(self.key(string1), self.key(string2))
 
+    def words(self, string):
+        """Split the string along word boundaries."""
+        if isinstance(string, str):
+            string = string.decode(self.encoding, 'replace')
+        return string.split()
+
     def sortemekey(self, string, invalid=float('inf')):
+        """Return a key based on sortemes of a string.
+
+        If the string is a str instance, it is decoded to a unicode
+        instance according to the 'encoding' attribute of the
+        Collator.
+        """
         keys = []
-        for sorteme in collate._strings.sortemes(string):
-            num, alpha = collate._strings.numeric(sorteme, invalid)
+        if isinstance(string, str):
+            string = string.decode(self.encoding, 'replace')
+        for sorteme in collate.strings.sortemes(string):
+            num, alpha = collate.strings.numeric(sorteme, invalid)
             if num == invalid:
                 keys.append(self.key(alpha))
             else:
diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py
index 892b8a1..5f3ec05 100644
--- a/collate/icu/__init__.py
+++ b/collate/icu/__init__.py
@@ -36,6 +36,13 @@ class Collator(collate._abcollator.Collator):
             # so this is a harmless error.
             self._breaker = _icu.WordBreaker("root")
 
+    def words(self, string):
+        """Split the string along word boundaries, skipping whitespace."""
+        if isinstance(string, str):
+            # Lenient decode, matching the base Collator's words().
+            string = string.decode(self.encoding, 'replace')
+        return [w for w in self._breaker.words(string) if not w.isspace()]
+
     def key(self, string):
         """Sort key for a string.
 
diff --git a/collate/_strings.py b/collate/strings.py
similarity index 95%
rename from collate/_strings.py
rename to collate/strings.py
index aed2ba7..bc4ed62 100644
--- a/collate/_strings.py
+++ b/collate/strings.py
@@ -10,6 +10,8 @@ CONTINUE_ON = frozenset([
 
 UNKNOWN, LETTER, NUMBER = range(3)
 
+BREAKER = u"\u2029"
+
 def sortemes(string):
     """Generate a list of sortemes for the string.
 
@@ -32,6 +34,9 @@ def sortemes(string):
     mode = UNKNOWN
     previous_mode = UNKNOWN
     category = "XX"
+
+    # TODO(jfw): This loop evolved over time; there is probably a much
+    # faster / more concise way to express it now.
     for i, c in enumerate(string):
         broke = False
         prev_category = category
@@ -72,14 +77,13 @@ def sortemes(string):
             # If we read two strings separated by weird punctuation,
             # pretend the punctuation isn't there.
             if (this_mode == previous_mode == LETTER
-                and (category[0] == "P" or prev_category[0] == "P")
                 and words):
-                words[-1] += u" " + string[start:last+1]
+                words[-1] += BREAKER + string[start:last+1]
             else:
                 # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
                 # Which sorts after ["foo", "bar"].
                 if this_mode == NUMBER and previous_mode == LETTER and words:
-                    words[-1] += u" "
+                    words[-1] += BREAKER
                 words.append(string[start:last+1])
                 previous_mode = this_mode
 
@@ -91,10 +95,10 @@ def sortemes(string):
     this_mode = mode
     if start is not None and last is not None:
         if this_mode == LETTER and previous_mode == LETTER and words:
-            words[-1] += u" " + string[start:last+1]
+            words[-1] += BREAKER + string[start:last+1]
         else:
             if this_mode == NUMBER and previous_mode == LETTER and words:
-                words[-1] += u" "
+                words[-1] += BREAKER
             words.append(string[start:last+1])
     return words
 
-- 
2.30.2