strings: Microoptimizations, saves about 10% of runtime.

[python-collate.git] / collate / strings.py
diff --git a/collate/strings.py b/collate/strings.py

index e2750d1..487257f 100644 (file)
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -62,25 +62,23 @@ def sortemes(string, key=lambda s: s):
  
      """
  
+    if not string:
+        return []
+
      words = []
      letters = []
      digits = []
-    if not string:
-        return words
+    lappend = letters.append
+    dappend = digits.append
      string = unicode(string)
      categories = map(unicodedata.category, string)
      previous = UNKNOWN
+    wappend = words.append
+    join = u"".join
+    i = 0
  
-    def aletters(letters):
-        """Add a group of letters to the word list."""
-        words.append((INFINITY, stripends(letters)))
-    def adigits(digits):
-        """Add a group of digits to the word list."""
-        words.append((numeric(digits), u''))
-
-    # TODO(jfw): This kind of evolved over time, there's probably a much
-    # faster / more concise way to express it now.
-    for i, (uchar, category) in enumerate(zip(string, categories)):
+    for uchar in string:
+        category = categories[i]
  
          if letters and previous == LETTER and words:
              word = stripends(words.pop()[1].strip()) + BREAKER
@@ -90,57 +88,61 @@ def sortemes(string, key=lambda s: s):
          # Split at the first letter following a number or
          # non-continuing character.
          if category[0] == "L":
-            letters.append(uchar)
+            lappend(uchar)
              if digits:
-                adigits(u"".join(digits).strip())
-                digits = []
+                words.append((numeric(join(digits).strip()), u''))
+                del(digits[:])
                  previous = NUMBER
  
          # Split at the first number following a non-number or
          # non-continuing character.
          elif category[0] == "N":
-            digits.append(uchar)
+            dappend(uchar)
              if letters:
                  if unicodedata.category(letters[-1])[0] == "L":
-                    letters.append(HBREAKER)
-                aletters(u"".join(letters))
-                letters = []
+                    lappend(HBREAKER)
+                wappend((INFINITY, stripends(join(letters))))
+                del(letters[:])
                  previous = LETTER
  
          # Only certain punctuation allowed in numbers.
          elif digits and uchar not in ALLOWED_IN_NUMBERS:
-            adigits(u"".join(digits))
-            digits = []
+            words.append((numeric(join(digits)), u''))
+            del(digits[:])
              previous = NUMBER
  
          # Split if we find a non-continuing character ("weird" ones).
-        elif letters and category not in CONTINUE_ON:
+        elif category not in CONTINUE_ON:
              if letters:
-                aletters(u"".join(letters).strip() + BREAKER)
-                letters = []
+                wappend(
+                    (INFINITY,
+                     stripends(join(letters).strip() + BREAKER)))
+                del(letters[:])
                  previous = LETTER
              if digits:
-                adigits(u"".join(digits).strip())
-                digits = []
+                words.append((numeric(join(digits)), u''))
+                del(digits[:])
                  previous = NUMBER
  
          # Split if we find two pieces of punctuation in a row, even
          # if we should otherwise continue.
-        elif i and categories[i-1][0] in "P" and category[0] in "P":
+        elif i and categories[i - 1][0] == category[0] == "P":
              if letters:
-                aletters(u"".join(letters))
-                letters = []
+                wappend((INFINITY, stripends(join(letters))))
+                del(letters[:])
                  previous = LETTER
              if digits:
-                adigits(u"".join(digits))
-                digits = []
+                words.append((numeric(join(digits)), u''))
+                del(digits[:])
                  previous = NUMBER
  
          else:
              if digits:
-                digits.append(uchar)
+                dappend(uchar)
              elif letters:
-                letters.append(uchar)
+                lappend(uchar)
+
+        i += 1
  
      if letters and previous == LETTER and words:
          word = stripends(words.pop()[1].strip()) + BREAKER
@@ -148,11 +150,11 @@ def sortemes(string, key=lambda s: s):
          previous = UNKNOWN
  
      if letters:
-        aletters(u"".join(letters))
+        wappend((INFINITY, stripends(join(letters))))
      if digits:
-        adigits(u"".join(digits))
+        words.append((numeric(join(digits)), u''))
  
-    return [(i, key(w) if w else u'') for i, w in words]
+    return [(i, key(w)) for i, w in words]
  
  def numeric(orig, invalid=INFINITY):
      """Parse a number out of a string.