strings: Microoptimizations, saves about 10% of runtime.
[python-collate.git] / collate / strings.py
index 8d6af99..487257f 100644 (file)
@@ -1,6 +1,6 @@
 """String utility functions for collation."""
 
-__all__ = ["sortemes", "numeric", "normalize_number"]
+__all__ = ["sortemes", "numeric", "normalize_number", "deroman"]
 
 import unicodedata
 
@@ -14,12 +14,32 @@ CONTINUE_ON = frozenset([
 
 UNKNOWN, LETTER, NUMBER = range(3)
 
-BREAKER = u"\u2029" # Paragraph break character
+BREAKER = u"\u2028" # Line break character
+HBREAKER = u"\u2029" # Paragraph break character
 INFINITY = float('inf')
 
 KEEP_IN_NUMBERS = u"'.,"
 ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
 
+ROMAN = {
+    u"i": 1,
+    u"v": 5,
+    u"x": 10,
+    u"l": 50,
+    u"c": 100,
+    u"d": 500,
+    u"m": 1000,
+    u"\u2180": 1000,
+    u"\u2181": 5000,
+    u"\u2182": 10000,
+    u"\u2183": 100,
+    u"\u2184": 100,
+    u"\u2185": 6,
+    u"\u2186": 50,
+    u"\u2187": 50000,
+    u"\u2188": 100000,
+    }
+
 def stripends(word):
     """Strip punctuation and symbols from the ends of a string."""
     while word and unicodedata.category(word[0])[0] in "PS":
@@ -42,25 +62,23 @@ def sortemes(string, key=lambda s: s):
 
     """
 
+    if not string:
+        return []
+
     words = []
     letters = []
     digits = []
-    if not string:
-        return words
+    lappend = letters.append
+    dappend = digits.append
     string = unicode(string)
     categories = map(unicodedata.category, string)
     previous = UNKNOWN
+    wappend = words.append
+    join = u"".join
+    i = 0
 
-    def aletters(letters):
-        """Add a group of letters to the word list."""
-        words.append((INFINITY, stripends(letters)))
-    def adigits(digits):
-        """Add a group of digits to the word list."""
-        words.append((numeric(digits), u''))
-
-    # TODO(jfw): This kind of evolved over time, there's probably a much
-    # faster / more concise way to express it now.
-    for i, (uchar, category) in enumerate(zip(string, categories)):
+    for uchar in string:
+        category = categories[i]
 
         if letters and previous == LETTER and words:
             word = stripends(words.pop()[1].strip()) + BREAKER
@@ -70,55 +88,61 @@ def sortemes(string, key=lambda s: s):
         # Split at the first letter following a number or
         # non-continuing character.
         if category[0] == "L":
-            letters.append(uchar)
+            lappend(uchar)
             if digits:
-                adigits(u"".join(digits).strip())
-                digits = []
+                words.append((numeric(join(digits).strip()), u''))
+                del(digits[:])
                 previous = NUMBER
 
         # Split at the first number following a non-number or
         # non-continuing character.
         elif category[0] == "N":
-            digits.append(uchar)
+            dappend(uchar)
             if letters:
-                aletters(u"".join(letters))
-                letters = []
+                if unicodedata.category(letters[-1])[0] == "L":
+                    lappend(HBREAKER)
+                wappend((INFINITY, stripends(join(letters))))
+                del(letters[:])
                 previous = LETTER
 
         # Only certain punctuation allowed in numbers.
         elif digits and uchar not in ALLOWED_IN_NUMBERS:
-            adigits(u"".join(digits))
-            digits = []
+            words.append((numeric(join(digits)), u''))
+            del(digits[:])
             previous = NUMBER
 
         # Split if we find a non-continuing character ("weird" ones).
-        elif letters and category not in CONTINUE_ON:
+        elif category not in CONTINUE_ON:
             if letters:
-                aletters(u"".join(letters).strip() + BREAKER)
-                letters = []
+                wappend(
+                    (INFINITY,
+                     stripends(join(letters).strip() + BREAKER)))
+                del(letters[:])
                 previous = LETTER
             if digits:
-                adigits(u"".join(digits).strip())
-                digits = []
+                words.append((numeric(join(digits)), u''))
+                del(digits[:])
                 previous = NUMBER
 
         # Split if we find two pieces of punctuation in a row, even
         # if we should otherwise continue.
-        elif i and categories[i-1][0] in "P" and category[0] in "P":
+        elif i and categories[i - 1][0] == category[0] == "P":
             if letters:
-                aletters(u"".join(letters))
-                letters = []
+                wappend((INFINITY, stripends(join(letters))))
+                del(letters[:])
                 previous = LETTER
             if digits:
-                adigits(u"".join(digits))
-                digits = []
+                words.append((numeric(join(digits)), u''))
+                del(digits[:])
                 previous = NUMBER
 
         else:
             if digits:
-                digits.append(uchar)
+                dappend(uchar)
             elif letters:
-                letters.append(uchar)
+                lappend(uchar)
+
+        i += 1
 
     if letters and previous == LETTER and words:
         word = stripends(words.pop()[1].strip()) + BREAKER
@@ -126,11 +150,11 @@ def sortemes(string, key=lambda s: s):
         previous = UNKNOWN
 
     if letters:
-        aletters(u"".join(letters))
+        wappend((INFINITY, stripends(join(letters))))
     if digits:
-        adigits(u"".join(digits))
+        words.append((numeric(join(digits)), u''))
 
-    return [(i, key(w) if w else u'') for i, w in words]
+    return [(i, key(w)) for i, w in words]
 
 def numeric(orig, invalid=INFINITY):
     """Parse a number out of a string.
@@ -151,6 +175,10 @@ def numeric(orig, invalid=INFINITY):
     else:
         return invalid
 
+    for char in string:
+        if u"\u2160" <= char <= u"\u2188":
+            return deroman(string)
+
     mult = 1
     while string[:1] == u"-" or string[:1] == u"+":
         if string[:1] == u"-":
@@ -273,3 +301,20 @@ def normalize_number(string):
         string = string.replace(u".", u"")
 
     return string or "NaN"
+
+def deroman(string):
+    """Turn a Roman numeral into an integer."""
+    string = unicodedata.normalize('NFKD', unicode(string)).lower()
+    previous = 0
+    building = 0
+    for char in reversed(string):
+        try:
+            value = ROMAN[char]
+        except KeyError:
+            continue
+        if value < previous:
+            building -= value
+        else:
+            building += value
+        previous = value
+    return building