X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2Fstrings.py;h=5badc8c66ae9826dc8335a9c8d00e129610caaff;hp=57172461b91eac0585395483e6e3ba658d7e4fc2;hb=576fe7bace3061ef949762141afbb3eb2ec31ecd;hpb=91fd1e4a0bb531462bc443c21001376411ff862d

diff --git a/collate/strings.py b/collate/strings.py
index 5717246..5badc8c 100644
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -11,14 +11,15 @@ CONTINUE_ON = frozenset([
 UNKNOWN, LETTER, NUMBER = range(3)
 
 BREAKER = u"\u2029" # Paragraph break character
+INFINITY = float('inf')
 
-def sortemes(string):
+def sortemes(string, key=lambda s: s):
     """Generate a list of sortemes for the string.
 
     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
     sort information. This is larger than a word boundry but smaller
     than a sentence boundry; roughly, a sorteme boundry occurs between
-    letters and numbers, between numbers and numbrs if 'too much'
+    letters and numbers, between numbers and numbers if 'too much'
     punctuation exists in between, between lines.
 
     There is no formal specification for sortemes; the goal of this
@@ -33,7 +34,6 @@ def sortemes(string):
     string = unicode(string)
     categories = map(unicodedata.category, string)
     previous = UNKNOWN
-    types = []
 
     def stripends(word):
         while word and unicodedata.category(word[0])[0] in "PS":
@@ -42,13 +42,18 @@ def sortemes(string):
             word = word[:-1]
         return word
 
+    def aletters(letters):
+        words.append((INFINITY, stripends(letters)))
+    def adigits(digits):
+        words.append((numeric(digits), u''))
+
     # TODO(jfw): This kind of evolved over time, there's probably a much
     # faster / more concise way to express it now.
     for i, (c, category) in enumerate(zip(string, categories)):
 
         if letters and previous == LETTER and words:
-            word = stripends(words.pop().strip())
-            letters = list(stripends(word).strip() + BREAKER) + letters
+            word = stripends(words.pop()[1].strip()) + BREAKER
+            letters.insert(0, word)
             previous = UNKNOWN
 
         # Split at the first letter following a number or
@@ -56,47 +61,47 @@ def sortemes(string):
         if category[0] == "L":
             letters.append(c)
             if digits:
-                words.append(u"".join(digits).strip())
-                previous = NUMBER
+                adigits(u"".join(digits).strip())
                 digits = []
+                previous = NUMBER
 
         # Split at the first number following a non-number or
         # non-continuing character.
         elif category[0] == "N":
             digits.append(c)
             if letters:
-                words.append(u"".join(letters))
-                previous = LETTER
+                aletters(u"".join(letters))
                 letters = []
+                previous = LETTER
 
         # Only certain punctuation allowed in numbers.
         elif digits and c not in "',._":
-            words.append(u"".join(digits))
-            previous = NUMBER
+            adigits(u"".join(digits))
             digits = []
+            previous = NUMBER
 
         # Split if we find a non-continuing character ("weird" ones).
         elif letters and category not in CONTINUE_ON:
             if letters:
-                words.append(u"".join(letters).strip() + BREAKER)
-                previous = LETTER
+                aletters(u"".join(letters).strip() + BREAKER)
                 letters = []
+                previous = LETTER
             if digits:
-                words.append(u"".join(digits).strip() + BREAKER)
-                previous = NUMBER
+                adigits(u"".join(digits).strip())
                 digits = []
+                previous = NUMBER
 
         # Split if we find two pieces of punctuation in a row, even
         # if we should otherwise continue.
         elif i and categories[i-1][0] in "P" and category[0] in "P":
             if letters:
-                words.append(u"".join(letters))
-                previous = LETTER
+                aletters(u"".join(letters))
                 letters = []
+                previous = LETTER
             if digits:
-                words.append(u"".join(digits))
-                previous = NUMBER
+                adigits(u"".join(digits))
                 digits = []
+                previous = NUMBER
 
         else:
             if digits:
@@ -105,30 +110,27 @@ def sortemes(string):
                 letters.append(c)
 
     if letters and previous == LETTER and words:
-        word = stripends(words.pop().strip())
-        letters = list(stripends(word).strip() + BREAKER) + letters
+        word = stripends(words.pop()[1].strip()) + BREAKER
+        letters.insert(0, word)
         previous = UNKNOWN
 
     if letters:
-        words.append(u"".join(letters))
-        letters = []
+        aletters(u"".join(letters))
     if digits:
-        words.append(u"".join(digits))
-        digits = []
+        adigits(u"".join(digits))
 
-    words = map(stripends, words)
-    return words
+    return [(i, key(w) if w else u'') for i, w in words]
 
-def numeric(orig, invalid=float('inf')):
+def numeric(orig, invalid=INFINITY):
     if not orig:
-        return (invalid, '')
+        return invalid
 
     string = unicode(orig)
     for c in string:
         if c.isnumeric():
             break
     else:
-        return (invalid, orig)
+        return invalid
 
     mult = 1
     while string[:1] == u"-" or string[:1] == u"+":
@@ -141,12 +143,6 @@ def numeric(orig, invalid=float('inf')):
 
     string = normalize_punc(string)
 
-    # Early out if possible.
-    try:
-        return (float(string) * mult, orig)
-    except ValueError:
-        pass
-
     # Otherwise we need to do this the hard way.
     def _numeric(string):
         total = 0
@@ -161,9 +157,9 @@ def numeric(orig, invalid=float('inf')):
         whole, frac = string.split(".")
         whole = _numeric(whole)
         frac = _numeric(frac) / (10.0 ** len(frac))
-        return (mult * (whole + frac), orig)
+        return mult * (whole + frac)
     except ValueError:
-        return (mult * _numeric(string), orig)
+        return mult * _numeric(string)
 
 def normalize_punc(string):
     string = unicode(string.strip(u",.'"))