strings: Microoptimizations, saves about 10% of runtime.
author: Joe Wreschnig <joe.wreschnig@gmail.com>
Thu, 25 Feb 2010 23:54:14 +0000 (15:54 -0800)
committer: Joe Wreschnig <joe.wreschnig@gmail.com>
Thu, 25 Feb 2010 23:54:14 +0000 (15:54 -0800)
collate/strings.py

index e2750d19b40175a35f588a225db614d932cc9559..487257ffe6e8a02dbdde8b9be669a25d1a976ba5 100644 (file)
@@ -62,25 +62,23 @@ def sortemes(string, key=lambda s: s):
 
     """
 
+    if not string:
+        return []
+
     words = []
     letters = []
     digits = []
-    if not string:
-        return words
+    lappend = letters.append
+    dappend = digits.append
     string = unicode(string)
     categories = map(unicodedata.category, string)
     previous = UNKNOWN
+    wappend = words.append
+    join = u"".join
+    i = 0
 
-    def aletters(letters):
-        """Add a group of letters to the word list."""
-        words.append((INFINITY, stripends(letters)))
-    def adigits(digits):
-        """Add a group of digits to the word list."""
-        words.append((numeric(digits), u''))
-
-    # TODO(jfw): This kind of evolved over time, there's probably a much
-    # faster / more concise way to express it now.
-    for i, (uchar, category) in enumerate(zip(string, categories)):
+    for uchar in string:
+        category = categories[i]
 
         if letters and previous == LETTER and words:
             word = stripends(words.pop()[1].strip()) + BREAKER
@@ -90,57 +88,61 @@ def sortemes(string, key=lambda s: s):
         # Split at the first letter following a number or
         # non-continuing character.
         if category[0] == "L":
-            letters.append(uchar)
+            lappend(uchar)
             if digits:
-                adigits(u"".join(digits).strip())
-                digits = []
+                words.append((numeric(join(digits).strip()), u''))
+                del(digits[:])
                 previous = NUMBER
 
         # Split at the first number following a non-number or
         # non-continuing character.
         elif category[0] == "N":
-            digits.append(uchar)
+            dappend(uchar)
             if letters:
                 if unicodedata.category(letters[-1])[0] == "L":
-                    letters.append(HBREAKER)
-                aletters(u"".join(letters))
-                letters = []
+                    lappend(HBREAKER)
+                wappend((INFINITY, stripends(join(letters))))
+                del(letters[:])
                 previous = LETTER
 
         # Only certain punctuation allowed in numbers.
         elif digits and uchar not in ALLOWED_IN_NUMBERS:
-            adigits(u"".join(digits))
-            digits = []
+            words.append((numeric(join(digits)), u''))
+            del(digits[:])
             previous = NUMBER
 
         # Split if we find a non-continuing character ("weird" ones).
-        elif letters and category not in CONTINUE_ON:
+        elif category not in CONTINUE_ON:
             if letters:
-                aletters(u"".join(letters).strip() + BREAKER)
-                letters = []
+                wappend(
+                    (INFINITY,
+                     stripends(join(letters).strip() + BREAKER)))
+                del(letters[:])
                 previous = LETTER
             if digits:
-                adigits(u"".join(digits).strip())
-                digits = []
+                words.append((numeric(join(digits)), u''))
+                del(digits[:])
                 previous = NUMBER
 
         # Split if we find two pieces of punctuation in a row, even
         # if we should otherwise continue.
-        elif i and categories[i-1][0] in "P" and category[0] in "P":
+        elif i and categories[i - 1][0] == category[0] == "P":
             if letters:
-                aletters(u"".join(letters))
-                letters = []
+                wappend((INFINITY, stripends(join(letters))))
+                del(letters[:])
                 previous = LETTER
             if digits:
-                adigits(u"".join(digits))
-                digits = []
+                words.append((numeric(join(digits)), u''))
+                del(digits[:])
                 previous = NUMBER
 
         else:
             if digits:
-                digits.append(uchar)
+                dappend(uchar)
             elif letters:
-                letters.append(uchar)
+                lappend(uchar)
+
+        i += 1
 
     if letters and previous == LETTER and words:
         word = stripends(words.pop()[1].strip()) + BREAKER
@@ -148,11 +150,11 @@ def sortemes(string, key=lambda s: s):
         previous = UNKNOWN
 
     if letters:
-        aletters(u"".join(letters))
+        wappend((INFINITY, stripends(join(letters))))
     if digits:
-        adigits(u"".join(digits))
+        words.append((numeric(join(digits)), u''))
 
-    return [(i, key(w) if w else u'') for i, w in words]
+    return [(i, key(w)) for i, w in words]
 
 def numeric(orig, invalid=INFINITY):
     """Parse a number out of a string.