'Advanced' sorteme functions.

author Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 16 Feb 2010 04:49:57 +0000 (20:49 -0800)

committer Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 16 Feb 2010 04:49:57 +0000 (20:49 -0800)
author Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 16 Feb 2010 04:49:57 +0000 (20:49 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 16 Feb 2010 04:49:57 +0000 (20:49 -0800)
diff --git a/collate/_abcollator.py b/collate/_abcollator.py

index a6ec268..bc43dc3 100644 (file)
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -1,3 +1,5 @@
+import collate._strings
+
  class Collator(object):
      def cmp(self, string1, string2):
          """Return negative if a < b, zero if a == b, positive if a > b."""
@@ -10,5 +12,13 @@ class Collator(object):
          """
          return string.split()
  
-    def wordkeys(self, string):
-        return map(self.key, self.words)
+    def sortemes(self, string):
+        words = []
+        for word in self.words(string):
+            words.extend(collate._strings.alnumsplit(word))
+        return filter(collate._strings.wordlike, words)
+
+    def sortemekey(self, string):
+        words = map(collate._strings.numeric, self.sortemes(string))
+        words = [(i, self.key(word)) for (i, word) in words]
+        return words
diff --git a/collate/_strings.py b/collate/_strings.py

index 6bcfd9c..fd18bb9 100644 (file)
--- a/collate/_strings.py
+++ b/collate/_strings.py
@@ -1,3 +1,5 @@
+import unicodedata
+
  def alnumsplit(string):
      string = unicode(string)
      strings = []
@@ -16,12 +18,12 @@ def alnumsplit(string):
              broke = True
          if broke:
              if word:
-                strings.append("".join(word))
+                strings.append(u"".join(word))
                  word = []
              numeric = None
          word.append(char)
      if word:
-        strings.append("".join(word))
+        strings.append(u"".join(word))
      return strings
  
  def wordlike(string):
@@ -29,56 +31,84 @@ def wordlike(string):
  
      Word-like strings contain at least one alphanumeric character.
      """
-    return any(map(type(string).isalnum, string))
  
-def numeric(string, invalid=float('inf')):
-    string = unicode(string)
-    if not any(map(type(string).isnumeric, string)):
-        return (invalid, string)
-    if not string:
+    # Explicit loop is faster than:
+    #return any(map(type(string).isalnum, string))
+
+    for c in string:
+        if c.isalnum():
+            return True
+    else:
+        return False
+
+def numeric(orig, invalid=float('inf')):
+    if not orig:
          return (invalid, '')
+    string = unicode(orig)
+    for c in string:
+        if c.isnumeric():
+            break
+    else:
+        return (invalid, orig)
  
      mult = 1
-    while string[:1] == "-" or string[:1] == "+":
-        if string[0] == "-":
+    while string[:1] == u"-" or string[:1] == u"+":
+        if string[:1] == u"-":
              mult = -mult
          string = string[1:]
  
-    # Maybe we got lucky and this is a trivial case...
+    # Early out if possible.
      try:
-        return float(string) * mult
+        return (float(string) * mult, orig)
      except ValueError:
          pass
  
      # Otherwise we need to do this the hard way.
-    return mult * float(normalize_dots(string))
+    string = normalize_punc(string)
+
+    def _numeric(string):
+        total = 0
+        for c in string:
+            v = unicodedata.numeric(c)
+            if v >= 1:
+                total *= 10
+            total += v
+        return total
+
+    try:
+        whole, frac = string.split(".")
+        whole = _numeric(whole)
+        frac = _numeric(frac) / (10.0 ** len(frac))
+        return (mult * (whole + frac), orig)
+    except ValueError:
+        return (mult * _numeric(string), orig)
  
-def normalize_dots(string):
-    string = unicode(string.strip(",.'"))
-    string = filter(lambda u: u.isnumeric() or u in ",.'", string)
-    commas = string.count(",")
-    stops = string.count(".")
-    quotes = string.count("'")
+def normalize_punc(string):
+    string = unicode(string.strip(u",.'"))
+    string = filter(lambda u: u.isnumeric() or u in u",.'", string)
+    commas = string.count(u",")
+    stops = string.count(u".")
+    quotes = string.count(u"'")
  
      # If anything occurs more than once, it's a separator.
      if commas > 1:
-        string = string.replace(",", "")
+        string = string.replace(u",", u"")
          commas = 0
      if stops > 1:
-        string = string.replace(".", "")
+        string = string.replace(u".", u"")
          stops = 0
      if quotes > 1:
-        string = string.replace("'", "")
+        string = string.replace(u"'", u"")
          quotes = 0
  
-    def normalize_two(a, b):
+    def normalize_two(a, b, string):
          # One of each - assume the first is grouping, second is point.
          a_idx = string.rindex(a)
          b_idx = string.rindex(b)
          if a_idx > b_idx:
-            string = string.replace(b, "").replace(a, ".")
+            string = string.replace(b, u"").replace(a, u".")
          else:
-            string = string.replace(a, "").replace(b, ".")
+            string = string.replace(a, u"").replace(b, u".")
          return string
  
      if commas and stops and quotes:
@@ -90,37 +120,43 @@ def normalize_dots(string):
          # Not really valid, so do whatever we want...
          # A'AAA.BB,CC
          # A'AAA,BB.CC
-        comma_idx = string.index(",")
-        stops_idx = string.index(".")
-        quotes_idx = string.index("'")
+        comma_idx = string.index(u",")
+        stops_idx = string.index(u".")
+        quotes_idx = string.index(u"'")
          if (comma_idx < stops_idx < quotes_idx
              or quotes_idx < stops_idx < comma_idx):
-            string = string.replace(",", "").replace("'", "")
+            string = string.replace(u",", u"").replace(u"'", u"")
          elif (comma_idx < quotes_idx < stops_idx
              or stops_idx < quotes_idx < comma_idx):
-            string = string.replace(",", "").replace(".", "").replace("'", ".")
+            string = string.replace(
+                u",", u"").replace(
+                u".", u"").replace(
+                u"'", u".")
          else:
-            string = string.replace("'", "").replace(".", "").replace(",", ".")
+            string = string.replace(
+                u"'", u"").replace(
+                u".", u"").replace(
+                u",", u".")
  
      elif stops and quotes:
-        string = normalize_two('.', "'")
+        string = normalize_two(u".", u"'", string)
  
      elif commas and quotes:
-        string = normalize_two(',', "'")
+        string = normalize_two(u",", u"'", string)
  
      elif commas and stops:
-        string = normalize_two(',', '.')
+        string = normalize_two(u",", u".", string)
  
      elif commas:
-        if string[-4:-3] == "," and len(string) <= 7:
+        if string[-4:-3] == u"," and len(string) <= 7:
              # Single comma as a thousands separator.
-            string = string.replace(",", "")
+            string = string.replace(u",", u"")
          else:
              # Single comma, not thousands - probably a decimal point.
-            string = string.replace(",", ".")
+            string = string.replace(u",", u".")
  
      elif quotes:
          # Single quote, probably MM'SS", equivalent to a decimal point.
-        string = string.replace("'", ".")
+        string = string.replace(u"'", u".")
  
      return string
diff --git a/pycollate b/pycollate

index ec7f9f9..e38e91d 100755 (executable)
--- a/pycollate
+++ b/pycollate
@@ -54,7 +54,7 @@ def main(argv):
             line = line.strip()
             line = line.decode(encoding, "replace")
             lines.append(line)
-    lines.sort(key=collate.key)
+    lines.sort(key=collate.collator.sortemekey)
  
      for line in lines:
         print line.encode(encoding, "replace")
author	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 16 Feb 2010 04:49:57 +0000 (20:49 -0800)
committer	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 16 Feb 2010 04:49:57 +0000 (20:49 -0800)
collate/_abcollator.py		patch | blob | history
collate/_strings.py		patch | blob | history
pycollate		patch | blob | history