Calculate sortemes using simple alnum splitting rather than word breaks. Faster and...

author Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 16 Feb 2010 08:10:44 +0000 (00:10 -0800)

committer Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 16 Feb 2010 08:10:44 +0000 (00:10 -0800)
author Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 16 Feb 2010 08:10:44 +0000 (00:10 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 16 Feb 2010 08:10:44 +0000 (00:10 -0800)
diff --git a/collate/_abcollator.py b/collate/_abcollator.py

index bc43dc384d75e926ba1636356f931a44f3b81096..02cb733c79d889462a03e4680ece539e664cb696 100644 (file)
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -13,12 +13,11 @@ class Collator(object):
          return string.split()
  
      def sortemes(self, string):
-        words = []
-        for word in self.words(string):
-            words.extend(collate._strings.alnumsplit(word))
-        return filter(collate._strings.wordlike, words)
+        return collate._strings.alnumsplit(string)
  
-    def sortemekey(self, string):
-        words = map(collate._strings.numeric, self.sortemes(string))
-        words = [(i, self.key(word)) for (i, word) in words]
+    def sortemekey(self, string, invalid=float('inf')):
+        words = []
+        for sorteme in self.sortemes(string):
+            num, alpha = collate._strings.numeric(sorteme, invalid)
+            words.append((num, self.key(alpha)))
          return words
diff --git a/collate/_strings.py b/collate/_strings.py

index fd18bb917008dcb56aa06246dc86d1a721fe601c..f81bfd7e20be5bc65529f85e3bccab5fb0f15d60 100644 (file)
--- a/collate/_strings.py
+++ b/collate/_strings.py
@@ -1,11 +1,20 @@
  import unicodedata
  
+def strip_nonalnum(string):
+    while string and not (string[0].isalpha() or string[0].isnumeric()):
+        string = string[1:]
+    while string and not (string[-1].isalpha() or string[-1].isnumeric()):
+        string = string[:-1]
+    return string
+
  def alnumsplit(string):
+    if not string:
+        return []
      string = unicode(string)
      strings = []
-    word = []
      numeric = None
-    for char in string:
+    start = 0
+    for i, char in enumerate(string):
          if numeric is None:
              broke = False
              if char.isnumeric():
@@ -14,16 +23,18 @@ def alnumsplit(string):
                  numeric = False
          elif numeric and char.isalpha():
              broke = True
+            numeric = False
+        elif numeric and char.isspace():
+            broke = True
+            numeric = None
          elif not numeric and char.isnumeric():
              broke = True
+            numeric = True
          if broke:
-            if word:
-                strings.append(u"".join(word))
-                word = []
-            numeric = None
-        word.append(char)
-    if word:
-        strings.append(u"".join(word))
+            strings.append(strip_nonalnum(string[start:i]))
+            start = i
+            broke = False
+    strings.append(strip_nonalnum(string[start:i + 1]))
      return strings
  
  def wordlike(string):
@@ -44,6 +55,7 @@ def wordlike(string):
  def numeric(orig, invalid=float('inf')):
      if not orig:
          return (invalid, '')
+
      string = unicode(orig)
      for c in string:
          if c.isnumeric():
@@ -57,6 +69,9 @@ def numeric(orig, invalid=float('inf')):
              mult = -mult
          string = string[1:]
  
+    if not string[:1].isnumeric():
+        return (invalid, orig)
+
      # Early out if possible.
      try:
          return (float(string) * mult, orig)
author	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 16 Feb 2010 08:10:44 +0000 (00:10 -0800)
committer	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 16 Feb 2010 08:10:44 +0000 (00:10 -0800)
collate/_abcollator.py		patch \| blob \| history
collate/_strings.py		patch \| blob \| history