_strings: Numeric string extraction routines.

author Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 16 Feb 2010 01:03:52 +0000 (17:03 -0800)

committer Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 16 Feb 2010 01:03:52 +0000 (17:03 -0800)
author Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 16 Feb 2010 01:03:52 +0000 (17:03 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 16 Feb 2010 01:03:52 +0000 (17:03 -0800)
diff --git a/collate/_abcollator.py b/collate/_abcollator.py

index 99866c36cac82698313be7c3237ddc3052a6843c..a6ec268d9bb6512bffd6b1d752088798fde6ea09 100644 (file)
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -9,3 +9,6 @@ class Collator(object):
          This split is done using Unicode's definition of whitespace.
          """
          return string.split()
+
+    def wordkeys(self, string):
+        """Return the collation key of every word in a string.
+
+        The string is split into words and each word is passed through
+        self.key().
+        """
+        # The original mapped over the attribute ``self.words`` itself and
+        # never used ``string``; split the argument instead.
+        # NOTE(review): assumes ``words`` is the splitting method whose
+        # tail is visible just above this one - confirm.
+        return map(self.key, self.words(string))
diff --git a/collate/_strings.py b/collate/_strings.py

new file mode 100644 (file)

index 0000000..6bcfd9c
--- /dev/null
+++ b/collate/_strings.py
@@ -0,0 +1,126 @@
+def alnumsplit(string):
+    """Split a string wherever it switches between letters and numbers.
+
+    The input is converted with unicode() and scanned one character at
+    a time.  A new piece starts whenever a numeric character follows an
+    alphabetic run, or an alphabetic character follows a numeric run.
+    Characters that are neither (spaces, punctuation) never start a new
+    piece; they stay attached to the piece currently being built.
+
+    Returns a list of the pieces in order; concatenating them gives
+    back the converted input.  An empty input yields an empty list.
+    """
+    string = unicode(string)
+    strings = []
+    word = []
+    # True inside a numeric run, False inside an alphabetic run, None
+    # until the first letter or number has been seen.
+    numeric = None
+    for char in string:
+        # isnumeric() is tested first so that a character reporting both
+        # isnumeric() and isalpha() is always treated as numeric.
+        if char.isnumeric():
+            kind = True
+        elif char.isalpha():
+            kind = False
+        else:
+            kind = None
+        if kind is not None:
+            if numeric is not None and kind != numeric:
+                # The run type changed: close the current piece.
+                strings.append("".join(word))
+                word = []
+            # Carry the class of this character forward.  Resetting to
+            # None here (as before) lost it, so "1a2" came out as
+            # ['1', 'a2'] instead of ['1', 'a', '2'].
+            numeric = kind
+        word.append(char)
+    if word:
+        strings.append("".join(word))
+    return strings
+
+def wordlike(string):
+    """Tell whether a string looks like a word.
+
+    A string counts as word-like as soon as one of its characters is
+    alphanumeric; an empty string is not word-like.
+    """
+    # Look the predicate up on the string's own type, as str and unicode
+    # each provide their own isalnum.
+    is_alnum = type(string).isalnum
+    for char in string:
+        if is_alnum(char):
+            return True
+    return False
+
+def numeric(string, invalid=float('inf')):
+    """Extract a numeric value from a string.
+
+    The input is converted with unicode().  Any run of leading "+" and
+    "-" signs is consumed, each "-" flipping the sign of the result.
+    The remainder is parsed with float() directly, and failing that
+    after being cleaned up by normalize_dots().
+
+    Returns the parsed value as a float.  If the string contains no
+    numeric character at all (this includes the empty string), or its
+    numeric characters cannot be parsed by float(), the tuple
+    (invalid, string) is returned, where string is the converted input.
+
+    NOTE(review): success yields a bare float while failure yields a
+    tuple; confirm with the callers which shape is intended.
+    """
+    string = unicode(string)
+    # any() is False for an empty string, so no separate emptiness
+    # check is needed (the one that used to follow was unreachable).
+    if not any(char.isnumeric() for char in string):
+        return (invalid, string)
+
+    original = string
+    mult = 1
+    while string[:1] == "-" or string[:1] == "+":
+        if string[0] == "-":
+            mult = -mult
+        string = string[1:]
+
+    # Maybe we got lucky and this is a trivial case...
+    try:
+        return float(string) * mult
+    except ValueError:
+        pass
+
+    # Otherwise we need to do this the hard way.
+    try:
+        return mult * float(normalize_dots(string))
+    except ValueError:
+        # isnumeric() accepts characters float() rejects (superscripts,
+        # fractions, ...); report those as invalid rather than raising.
+        return (invalid, original)
+
+def normalize_dots(string):
+    """Rewrite a number so that "." is its only separator.
+
+    Leading and trailing commas, full stops and apostrophes are
+    stripped, and every character that is neither numeric nor one of
+    those three separators is dropped.  The separators that remain are
+    then interpreted heuristically:
+
+    - a separator that occurs more than once is a grouping mark and is
+      removed;
+    - with two different separators left, the one further right is the
+      decimal point and the other is removed;
+    - with all three left, the one in the middle is the decimal point;
+    - a lone comma is a thousands separator if it sits exactly three
+      characters from the end of a string of at most seven characters,
+      otherwise a decimal point;
+    - a lone apostrophe becomes a decimal point;
+    - a lone full stop is left as it is.
+
+    Returns the rewritten string.
+    """
+    string = unicode(string.strip(",.'"))
+    # Build the filtered text with join() so the result is a string
+    # whatever filter() would return for a string argument.
+    string = "".join(u for u in string if u.isnumeric() or u in ",.'")
+    commas = string.count(",")
+    stops = string.count(".")
+    quotes = string.count("'")
+
+    # If anything occurs more than once, it's a separator.
+    if commas > 1:
+        string = string.replace(",", "")
+        commas = 0
+    if stops > 1:
+        string = string.replace(".", "")
+        stops = 0
+    if quotes > 1:
+        string = string.replace("'", "")
+        quotes = 0
+
+    def normalize_two(text, a, b):
+        # One of each - assume the first is grouping, second is point.
+        # The text is passed in explicitly: assigning to the enclosing
+        # ``string`` from here made it a local of this helper, so the
+        # rindex() calls raised UnboundLocalError.
+        a_idx = text.rindex(a)
+        b_idx = text.rindex(b)
+        if a_idx > b_idx:
+            return text.replace(b, "").replace(a, ".")
+        return text.replace(a, "").replace(b, ".")
+
+    if commas and stops and quotes:
+        # If all three, assume the middle is the decimal point.
+        # A,AAA.BB'CC
+        # A.AAA,BB'CC
+        # A,AAA'BB.CC
+        # A.AAA'BB,CC
+        # Not really valid, so do whatever we want...
+        # A'AAA.BB,CC
+        # A'AAA,BB.CC
+        comma_idx = string.index(",")
+        stops_idx = string.index(".")
+        quotes_idx = string.index("'")
+        if (comma_idx < stops_idx < quotes_idx
+            or quotes_idx < stops_idx < comma_idx):
+            string = string.replace(",", "").replace("'", "")
+        elif (comma_idx < quotes_idx < stops_idx
+            or stops_idx < quotes_idx < comma_idx):
+            string = string.replace(",", "").replace(".", "").replace("'", ".")
+        else:
+            string = string.replace("'", "").replace(".", "").replace(",", ".")
+
+    elif stops and quotes:
+        string = normalize_two(string, '.', "'")
+
+    elif commas and quotes:
+        string = normalize_two(string, ',', "'")
+
+    elif commas and stops:
+        string = normalize_two(string, ',', '.')
+
+    elif commas:
+        if string[-4:-3] == "," and len(string) <= 7:
+            # Single comma as a thousands separator.
+            string = string.replace(",", "")
+        else:
+            # Single comma, not thousands - probably a decimal point.
+            string = string.replace(",", ".")
+
+    elif quotes:
+        # Single quote, probably MM'SS", equivalent to a decimal point.
+        string = string.replace("'", ".")
+
+    return string
author	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 16 Feb 2010 01:03:52 +0000 (17:03 -0800)
committer	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 16 Feb 2010 01:03:52 +0000 (17:03 -0800)
collate/_abcollator.py		patch \| blob \| history
collate/_strings.py	[new file with mode: 0644]	patch \| blob