"""Numeric string extraction routines (collate/_strings.py).

Helpers for splitting strings into alphabetic/numeric runs and for
pulling a floating point value out of loosely formatted text such as
"1,234.56", "1.234,56" or "12'30".
"""

# Provenance (recovered from the patch header this file was taken from):
#   From: Joe Wreschnig
#   Date: Tue, 16 Feb 2010 01:03:52 +0000 (-0800)
#   Subject: _strings: Numeric string extraction routines.
#   X-Git-Url: https://git.korewanetadesu.com/?a=commitdiff_plain;h=96cd5d3ad9dd1390c7739a6c3b9fa03ac3a2b4ff;p=python-collate.git
#
# The same commit also changes collate/_abcollator.py
# (index 99866c3..a6ec268), adding this method to class Collator:
#
#     def wordkeys(self, string):
#         return map(self.key, self.words)
#
# NOTE(review): that method maps over the attribute ``self.words`` itself
# and never uses ``string``; it presumably should be
# ``map(self.key, self.words(string))`` -- confirm and fix in
# _abcollator.py, which is not part of this module.

try:
    _text = unicode  # Python 2: only the unicode type has .isnumeric()
except NameError:
    _text = str  # Python 3: str is already the text type

# Characters that may act as grouping separators or as a decimal point.
_SEPARATORS = ",.'"


def _char_kind(char):
    """Return True for a numeric character, False for alphabetic, else None.

    Characters that are both numeric and alphabetic count as numeric.
    """
    if char.isnumeric():
        return True
    if char.isalpha():
        return False
    return None


def alnumsplit(string):
    """Split a string at every alphabetic/numeric boundary.

    Characters that are neither alphabetic nor numeric never cause a
    split; they stay attached to the run they follow (or to the first
    run when they lead the string).

    Returns a list of text pieces whose concatenation is the input,
    e.g. "track 12b" -> ["track ", "12", "b"].
    """
    string = _text(string)
    pieces = []
    current = []
    run_kind = None  # kind of the run being collected; None = undecided
    for char in string:
        kind = _char_kind(char)
        if kind is not None:
            if run_kind is not None and kind != run_kind:
                # ``current`` holds at least the character that fixed
                # ``run_kind``, so it is never empty here.
                pieces.append(_text().join(current))
                current = []
            # The breaking character itself decides the kind of the new
            # run; otherwise "a1b" would come out as ["a", "1b"].
            run_kind = kind
        current.append(char)
    if current:
        pieces.append(_text().join(current))
    return pieces


def wordlike(string):
    """Check if a string is 'word-like'.

    Word-like strings contain at least one alphanumeric character.
    """
    return any(char.isalnum() for char in string)


def numeric(string, invalid=float('inf')):
    """Extract a floating point value from a loosely formatted string.

    Leading "+" and "-" signs are consumed (each "-" flips the sign).
    The remainder is first handed to float() directly; if that fails,
    it is cleaned up with normalize_dots() and parsed again.

    Returns the parsed float on success.  When the string contains no
    numeric character, or no number can be parsed from it, returns the
    tuple ``(invalid, text)`` where ``text`` is the input converted to
    text.

    NOTE(review): the success and failure paths return different shapes
    (float vs. tuple).  That is the original contract and is kept here
    so existing callers keep working.
    """
    original = _text(string)
    # This also covers the empty string: any() of nothing is False.
    if not any(char.isnumeric() for char in original):
        return (invalid, original)

    sign = 1
    digits = original
    while digits[:1] in ("-", "+"):
        if digits[0] == "-":
            sign = -sign
        digits = digits[1:]

    # Maybe we got lucky and this is a trivial case...
    try:
        return float(digits) * sign
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    try:
        return sign * float(normalize_dots(digits))
    except ValueError:
        # Numeric characters float() cannot read (e.g. vulgar fractions)
        # are reported as invalid instead of raising.
        return (invalid, original)


def _fold_two(text, a, b):
    """Resolve one ``a`` and one ``b`` separator in ``text``.

    Whichever occurs first is taken as the grouping separator and is
    removed; the later one becomes the decimal point.
    """
    if text.rindex(a) > text.rindex(b):
        return text.replace(b, "").replace(a, ".")
    return text.replace(a, "").replace(b, ".")


def normalize_dots(string):
    """Rewrite a number so that "." is its only separator.

    Leading and trailing separators are stripped, then every character
    that is neither numeric nor one of , . ' is dropped.  The remaining
    separators are classified as grouping marks (removed) or as the
    decimal point (rewritten to "."), using their counts and order.

    Returns the normalized text; it is not guaranteed to be accepted by
    float() (e.g. when it contains non-decimal numeric characters).
    """
    text = _text(string).strip(_SEPARATORS)
    text = _text().join(
        char for char in text if char.isnumeric() or char in _SEPARATORS)
    commas = text.count(",")
    stops = text.count(".")
    quotes = text.count("'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        text = text.replace(",", "")
        commas = 0
    if stops > 1:
        text = text.replace(".", "")
        stops = 0
    if quotes > 1:
        text = text.replace("'", "")
        quotes = 0

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        comma_idx = text.index(",")
        stops_idx = text.index(".")
        quotes_idx = text.index("'")
        if (comma_idx < stops_idx < quotes_idx
                or quotes_idx < stops_idx < comma_idx):
            text = text.replace(",", "").replace("'", "")
        elif (comma_idx < quotes_idx < stops_idx
                or stops_idx < quotes_idx < comma_idx):
            text = text.replace(",", "").replace(".", "").replace("'", ".")
        else:
            text = text.replace("'", "").replace(".", "").replace(",", ".")

    elif stops and quotes:
        text = _fold_two(text, ".", "'")

    elif commas and quotes:
        text = _fold_two(text, ",", "'")

    elif commas and stops:
        text = _fold_two(text, ",", ".")

    elif commas:
        if text[-4:-3] == "," and len(text) <= 7:
            # Single comma as a thousands separator.
            text = text.replace(",", "")
        else:
            # Single comma, not thousands - probably a decimal point.
            text = text.replace(",", ".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        text = text.replace("'", ".")

    return text