New approach - find split points based on Unicode categories.

[python-collate.git] / collate / _abcollator.py
diff --git a/collate/_abcollator.py b/collate/_abcollator.py

index 2e6ab95..fdd7783 100644 (file)
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -5,20 +5,14 @@ class Collator(object):
          """Return negative if a < b, zero if a == b, positive if a > b."""
          return cmp(self.key(string1), self.key(string2))
  
-    def words(self, string):
-        """Split the string into separate words.
-
-        This split is done using Unicode's definition of whitespace.
-        """
-        return string.split()
-
-    def sortemes(self, string):
-        return collate._strings.alnumsplit(string)
-
      def sortemekey(self, string, invalid=float('inf')):
-        words = []
-        for sorteme in self.sortemes(string):
+        keys = []
+        for sorteme in collate._strings.sortemes(string):
              num, alpha = collate._strings.numeric(sorteme, invalid)
-            alpha = self.key(collate._strings.strip_punc(alpha))
-            words.append((num, alpha))
-        return words
+            if num == invalid:
+                keys.append(self.key(alpha))
+            else:
+                keys.append(num)
+        # Append the sort key of the whole original string as a tie-breaker,
+        # used only when the per-sorteme keys of two strings compare equal.
+        return (keys, self.key(string))