'Advanced' sorteme functions.
author Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 16 Feb 2010 04:49:57 +0000 (20:49 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 16 Feb 2010 04:49:57 +0000 (20:49 -0800)
collate/_abcollator.py
collate/_strings.py
pycollate

index a6ec268..bc43dc3 100644 (file)
@@ -1,3 +1,5 @@
+import collate._strings
+
 class Collator(object):
     def cmp(self, string1, string2):
         """Return negative if a < b, zero if a == b, positive if a > b."""
@@ -10,5 +12,13 @@ class Collator(object):
         """
         return string.split()
 
-    def wordkeys(self, string):
-        return map(self.key, self.words)
+    def sortemes(self, string):
+        """Return the word-like sortemes of a string, in order.
+
+        Every word from self.words(string) is split further with
+        collate._strings.alnumsplit, and only the pieces accepted by
+        collate._strings.wordlike are kept.
+        """
+        pieces = [piece
+                  for word in self.words(string)
+                  for piece in collate._strings.alnumsplit(word)]
+        return [piece for piece in pieces
+                if collate._strings.wordlike(piece)]
+
+    def sortemekey(self, string):
+        """Return a list of sort keys, one per sorteme of the string.
+
+        Each sorteme is passed through collate._strings.numeric; the
+        first element of the resulting pair is kept and the second is
+        replaced by self.key of it.
+        """
+        pairs = [collate._strings.numeric(sorteme)
+                 for sorteme in self.sortemes(string)]
+        return [(value, self.key(text)) for value, text in pairs]
index 6bcfd9c..fd18bb9 100644 (file)
@@ -1,3 +1,5 @@
+import unicodedata
+
 def alnumsplit(string):
     string = unicode(string)
     strings = []
@@ -16,12 +18,12 @@ def alnumsplit(string):
             broke = True
         if broke:
             if word:
-                strings.append("".join(word))
+                strings.append(u"".join(word))
                 word = []
             numeric = None
         word.append(char)
     if word:
-        strings.append("".join(word))
+        strings.append(u"".join(word))
     return strings
 
 def wordlike(string):
@@ -29,56 +31,84 @@ def wordlike(string):
 
     Word-like strings contain at least one alphanumeric character.
     """
-    return any(map(type(string).isalnum, string))
 
-def numeric(string, invalid=float('inf')):
-    string = unicode(string)
-    if not any(map(type(string).isnumeric, string)):
-        return (invalid, string)
-    if not string:
+    # Explicit loop is faster than:
+    #return any(map(type(string).isalnum, string))
+
+    for c in string:
+        if c.isalnum():
+            return True
+    else:
+        return False
+
+def numeric(orig, invalid=float('inf')):
+    if not orig:
         return (invalid, '')
+    string = unicode(orig)
+    for c in string:
+        if c.isnumeric():
+            break
+    else:
+        return (invalid, orig)
 
     mult = 1
-    while string[:1] == "-" or string[:1] == "+":
-        if string[0] == "-":
+    while string[:1] == u"-" or string[:1] == u"+":
+        if string[:1] == u"-":
             mult = -mult
         string = string[1:]
 
-    # Maybe we got lucky and this is a trivial case...
+    # Early out if possible.
     try:
-        return float(string) * mult
+        return (float(string) * mult, orig)
     except ValueError:
         pass
 
     # Otherwise we need to do this the hard way.
-    return mult * float(normalize_dots(string))
+    string = normalize_punc(string)
+
+    def _numeric(string):
+        # Sum up a run of numeric characters positionally.  Any character
+        # whose value is not a proper fraction -- including zero -- shifts
+        # the running total one decimal place before being added; a
+        # fraction such as U+00BD is added without a shift.
+        total = 0
+        for c in string:
+            v = unicodedata.numeric(c)
+            # Zero must shift too, otherwise u"50" would come out as 5.
+            if v == 0 or v >= 1:
+                total *= 10
+            total += v
+        return total
+
+    try:
+        whole, frac = string.split(".")
+        whole = _numeric(whole)
+        frac = _numeric(frac) / (10.0 ** len(frac))
+        return (mult * (whole + frac), orig)
+    except ValueError:
+        return (mult * _numeric(string), orig)
 
-def normalize_dots(string):
-    string = unicode(string.strip(",.'"))
-    string = filter(lambda u: u.isnumeric() or u in ",.'", string)
-    commas = string.count(",")
-    stops = string.count(".")
-    quotes = string.count("'")
+def normalize_punc(string):
+    string = unicode(string.strip(u",.'"))
+    string = filter(lambda u: u.isnumeric() or u in u",.'", string)
+    commas = string.count(u",")
+    stops = string.count(u".")
+    quotes = string.count(u"'")
 
     # If anything occurs more than once, it's a separator.
     if commas > 1:
-        string = string.replace(",", "")
+        string = string.replace(u",", u"")
         commas = 0
     if stops > 1:
-        string = string.replace(".", "")
+        string = string.replace(u".", u"")
         stops = 0
     if quotes > 1:
-        string = string.replace("'", "")
+        string = string.replace(u"'", u"")
         quotes = 0
 
-    def normalize_two(a, b):
+    def normalize_two(a, b, string):
         # One of each - assume the first is grouping, second is point.
         a_idx = string.rindex(a)
         b_idx = string.rindex(b)
         if a_idx > b_idx:
-            string = string.replace(b, "").replace(a, ".")
+            string = string.replace(b, u"").replace(a, u".")
         else:
-            string = string.replace(a, "").replace(b, ".")
+            string = string.replace(a, u"").replace(b, u".")
         return string
 
     if commas and stops and quotes:
@@ -90,37 +120,43 @@ def normalize_dots(string):
         # Not really valid, so do whatever we want...
         # A'AAA.BB,CC
         # A'AAA,BB.CC
-        comma_idx = string.index(",")
-        stops_idx = string.index(".")
-        quotes_idx = string.index("'")
+        comma_idx = string.index(u",")
+        stops_idx = string.index(u".")
+        quotes_idx = string.index(u"'")
         if (comma_idx < stops_idx < quotes_idx
             or quotes_idx < stops_idx < comma_idx):
-            string = string.replace(",", "").replace("'", "")
+            string = string.replace(u",", u"").replace(u"'", u"")
         elif (comma_idx < quotes_idx < stops_idx
             or stops_idx < quotes_idx < comma_idx):
-            string = string.replace(",", "").replace(".", "").replace("'", ".")
+            string = string.replace(
+                u",", u"").replace(
+                u".", u"").replace(
+                u"'", u".")
         else:
-            string = string.replace("'", "").replace(".", "").replace(",", ".")
+            string = string.replace(
+                u"'", u"").replace(
+                u".", u"").replace(
+                u",", u".")
 
     elif stops and quotes:
-        string = normalize_two('.', "'")
+        string = normalize_two(u".", u"'", string)
 
     elif commas and quotes:
-        string = normalize_two(',', "'")
+        string = normalize_two(u",", u"'", string)
 
     elif commas and stops:
-        string = normalize_two(',', '.')
+        string = normalize_two(u",", u".", string)
 
     elif commas:
-        if string[-4:-3] == "," and len(string) <= 7:
+        if string[-4:-3] == u"," and len(string) <= 7:
             # Single comma as a thousands separator.
-            string = string.replace(",", "")
+            string = string.replace(u",", u"")
         else:
             # Single comma, not thousands - probably a decimal point.
-            string = string.replace(",", ".")
+            string = string.replace(u",", u".")
 
     elif quotes:
         # Single quote, probably MM'SS", equivalent to a decimal point.
-        string = string.replace("'", ".")
+        string = string.replace(u"'", u".")
 
     return string
index ec7f9f9..e38e91d 100755 (executable)
--- a/pycollate
+++ b/pycollate
@@ -54,7 +54,7 @@ def main(argv):
            line = line.strip()
            line = line.decode(encoding, "replace")
            lines.append(line)
-    lines.sort(key=collate.key)
+    lines.sort(key=collate.collator.sortemekey)
 
     for line in lines:
        print line.encode(encoding, "replace")