From d46d035bdc1ef7276af7c41880034226d0cdfbfc Mon Sep 17 00:00:00 2001
From: Joe Wreschnig <joe.wreschnig@gmail.com>
Date: Tue, 16 Feb 2010 00:10:44 -0800
Subject: [PATCH] Calculate sortemes using simple alnum splitting rather than
 word breaks. Faster and slightly more accurate for our purposes. Strip
 leading and trailing punctuation from each sorteme.

---
 collate/_abcollator.py | 13 ++++++-------
 collate/_strings.py    | 33 ++++++++++++++++++++++++---------
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/collate/_abcollator.py b/collate/_abcollator.py
index bc43dc3..02cb733 100644
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -13,12 +13,11 @@ class Collator(object):
         return string.split()
 
     def sortemes(self, string):
-        words = []
-        for word in self.words(string):
-            words.extend(collate._strings.alnumsplit(word))
-        return filter(collate._strings.wordlike, words)
+        return collate._strings.alnumsplit(string)
 
-    def sortemekey(self, string):
-        words = map(collate._strings.numeric, self.sortemes(string))
-        words = [(i, self.key(word)) for (i, word) in words]
+    def sortemekey(self, string, invalid=float('inf')):
+        words = []
+        for sorteme in self.sortemes(string):
+            num, alpha = collate._strings.numeric(sorteme, invalid)
+            words.append((num, self.key(alpha)))
         return words
diff --git a/collate/_strings.py b/collate/_strings.py
index fd18bb9..f81bfd7 100644
--- a/collate/_strings.py
+++ b/collate/_strings.py
@@ -1,11 +1,20 @@
 import unicodedata
 
+def strip_nonalnum(string):
+    while string and not (string[0].isalpha() or string[0].isnumeric()):
+        string = string[1:]
+    while string and not (string[-1].isalpha() or string[-1].isnumeric()):
+        string = string[:-1]
+    return string
+
 def alnumsplit(string):
+    if not string:
+        return []
     string = unicode(string)
     strings = []
-    word = []
     numeric = None
-    for char in string:
+    start = 0
+    for i, char in enumerate(string):
         if numeric is None:
             broke = False
             if char.isnumeric():
@@ -14,16 +23,18 @@ def alnumsplit(string):
                 numeric = False
         elif numeric and char.isalpha():
             broke = True
+            numeric = False
+        elif numeric and char.isspace():
+            broke = True
+            numeric = None
         elif not numeric and char.isnumeric():
             broke = True
+            numeric = True
         if broke:
-            if word:
-                strings.append(u"".join(word))
-                word = []
-            numeric = None
-        word.append(char)
-    if word:
-        strings.append(u"".join(word))
+            strings.append(strip_nonalnum(string[start:i]))
+            start = i
+            broke = False
+    strings.append(strip_nonalnum(string[start:i + 1]))
     return strings
 
 def wordlike(string):
@@ -44,6 +55,7 @@ def wordlike(string):
 def numeric(orig, invalid=float('inf')):
     if not orig:
         return (invalid, '')
+
     string = unicode(orig)
     for c in string:
         if c.isnumeric():
@@ -57,6 +69,9 @@ def numeric(orig, invalid=float('inf')):
             mult = -mult
         string = string[1:]
 
+    if not string[:1].isnumeric():
+        return (invalid, orig)
+
     # Early out if possible.
     try:
         return (float(string) * mult, orig)
-- 
2.30.2