X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2F_strings.py;fp=collate%2F_strings.py;h=fd18bb917008dcb56aa06246dc86d1a721fe601c;hp=6bcfd9c5393c824e12edf84257c92866b9c48555;hb=53e1676b8d68cccd2b0692654d3871e44e0ba6b6;hpb=96cd5d3ad9dd1390c7739a6c3b9fa03ac3a2b4ff

diff --git a/collate/_strings.py b/collate/_strings.py
index 6bcfd9c..fd18bb9 100644
--- a/collate/_strings.py
+++ b/collate/_strings.py
@@ -1,3 +1,5 @@
+import unicodedata
+
 def alnumsplit(string):
     string = unicode(string)
     strings = []
@@ -16,12 +18,12 @@ def alnumsplit(string):
             broke = True
         if broke:
             if word:
-                strings.append("".join(word))
+                strings.append(u"".join(word))
                 word = []
             numeric = None
         word.append(char)
     if word:
-        strings.append("".join(word))
+        strings.append(u"".join(word))
     return strings
 
 def wordlike(string):
@@ -29,56 +31,84 @@ def wordlike(string):
 
     Word-like strings contain at least one alphanumeric character.
     """
-    return any(map(type(string).isalnum, string))
 
-def numeric(string, invalid=float('inf')):
-    string = unicode(string)
-    if not any(map(type(string).isnumeric, string)):
-        return (invalid, string)
-    if not string:
+    # Explicit loop is faster than:
+    #return any(map(type(string).isalnum, string))
+
+    for c in string:
+        if c.isalnum():
+            return True
+    else:
+        return False
+
+def numeric(orig, invalid=float('inf')):
+    if not orig:
         return (invalid, '')
+    string = unicode(orig)
+    for c in string:
+        if c.isnumeric():
+            break
+    else:
+        return (invalid, orig)
 
     mult = 1
-    while string[:1] == "-" or string[:1] == "+":
-        if string[0] == "-":
+    while string[:1] == u"-" or string[:1] == u"+":
+        if string[:1] == u"-":
             mult = -mult
         string = string[1:]
 
-    # Maybe we got lucky and this is a trivial case...
+    # Early out if possible.
     try:
-        return float(string) * mult
+        return (float(string) * mult, orig)
     except ValueError:
         pass
 
     # Otherwise we need to do this the hard way.
-    return mult * float(normalize_dots(string))
+    string = normalize_punc(string)
+
+    def _numeric(string):
+        # Positional sum of each character's Unicode numeric value.
+        total = 0
+        for c in string:
+            v = unicodedata.numeric(c)
+            # Zero shifts a place too ("10" is ten); fractions are just added.
+            total = total * 10 + v if (v >= 1 or v == 0) else total + v
+        return total
+
+    try:
+        whole, frac = string.split(".")
+        whole = _numeric(whole)
+        frac = _numeric(frac) / (10.0 ** len(frac))
+        return (mult * (whole + frac), orig)
+    except ValueError:
+        return (mult * _numeric(string), orig)
 
-def normalize_dots(string):
-    string = unicode(string.strip(",.'"))
-    string = filter(lambda u: u.isnumeric() or u in ",.'", string)
-    commas = string.count(",")
-    stops = string.count(".")
-    quotes = string.count("'")
+def normalize_punc(string):
+    string = unicode(string.strip(u",.'"))
+    string = filter(lambda u: u.isnumeric() or u in u",.'", string)
+    commas = string.count(u",")
+    stops = string.count(u".")
+    quotes = string.count(u"'")
 
     # If anything occurs more than once, it's a separator.
     if commas > 1:
-        string = string.replace(",", "")
+        string = string.replace(u",", u"")
         commas = 0
     if stops > 1:
-        string = string.replace(".", "")
+        string = string.replace(u".", u"")
         stops = 0
     if quotes > 1:
-        string = string.replace("'", "")
+        string = string.replace(u"'", u"")
         quotes = 0
 
-    def normalize_two(a, b):
+    def normalize_two(a, b, string):
         # One of each - assume the first is grouping, second is point.
         a_idx = string.rindex(a)
         b_idx = string.rindex(b)
         if a_idx > b_idx:
-            string = string.replace(b, "").replace(a, ".")
+            string = string.replace(b, u"").replace(a, u".")
         else:
-            string = string.replace(a, "").replace(b, ".")
+            string = string.replace(a, u"").replace(b, u".")
         return string
 
     if commas and stops and quotes:
@@ -90,37 +120,43 @@ def normalize_dots(string):
         # Not really valid, so do whatever we want...
         # A'AAA.BB,CC
         # A'AAA,BB.CC
-        comma_idx = string.index(",")
-        stops_idx = string.index(".")
-        quotes_idx = string.index("'")
+        comma_idx = string.index(u",")
+        stops_idx = string.index(u".")
+        quotes_idx = string.index(u"'")
         if (comma_idx < stops_idx < quotes_idx
             or quotes_idx < stops_idx < comma_idx):
-            string = string.replace(",", "").replace("'", "")
+            string = string.replace(u",", u"").replace(u"'", u"")
         elif (comma_idx < quotes_idx < stops_idx
             or stops_idx < quotes_idx < comma_idx):
-            string = string.replace(",", "").replace(".", "").replace("'", ".")
+            string = string.replace(
+                u",", u"").replace(
+                u".", u"").replace(
+                u"'", u".")
         else:
-            string = string.replace("'", "").replace(".", "").replace(",", ".")
+            string = string.replace(
+                u"'", u"").replace(
+                u".", u"").replace(
+                u",", u".")
 
     elif stops and quotes:
-        string = normalize_two('.', "'")
+        string = normalize_two(u".", u"'", string)
 
     elif commas and quotes:
-        string = normalize_two(',', "'")
+        string = normalize_two(u",", u"'", string)
 
     elif commas and stops:
-        string = normalize_two(',', '.')
+        string = normalize_two(u",", u".", string)
 
     elif commas:
-        if string[-4:-3] == "," and len(string) <= 7:
+        if string[-4:-3] == u"," and len(string) <= 7:
             # Single comma as a thousands separator.
-            string = string.replace(",", "")
+            string = string.replace(u",", u"")
         else:
             # Single comma, not thousands - probably a decimal point.
-            string = string.replace(",", ".")
+            string = string.replace(u",", u".")
 
     elif quotes:
         # Single quote, probably MM'SS", equivalent to a decimal point.
-        string = string.replace("'", ".")
+        string = string.replace(u"'", u".")
 
     return string