From 53e1676b8d68cccd2b0692654d3871e44e0ba6b6 Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Mon, 15 Feb 2010 20:49:57 -0800 Subject: [PATCH] 'Advanced' sorteme functions. --- collate/_abcollator.py | 14 +++++- collate/_strings.py | 112 +++++++++++++++++++++++++++-------------- pycollate | 2 +- 3 files changed, 87 insertions(+), 41 deletions(-) diff --git a/collate/_abcollator.py b/collate/_abcollator.py index a6ec268..bc43dc3 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -1,3 +1,5 @@ +import collate._strings + class Collator(object): def cmp(self, string1, string2): """Return negative if a < b, zero if a == b, positive if a > b.""" @@ -10,5 +12,13 @@ class Collator(object): """ return string.split() - def wordkeys(self, string): - return map(self.key, self.words) + def sortemes(self, string): + words = [] + for word in self.words(string): + words.extend(collate._strings.alnumsplit(word)) + return filter(collate._strings.wordlike, words) + + def sortemekey(self, string): + words = map(collate._strings.numeric, self.sortemes(string)) + words = [(i, self.key(word)) for (i, word) in words] + return words diff --git a/collate/_strings.py b/collate/_strings.py index 6bcfd9c..fd18bb9 100644 --- a/collate/_strings.py +++ b/collate/_strings.py @@ -1,3 +1,5 @@ +import unicodedata + def alnumsplit(string): string = unicode(string) strings = [] @@ -16,12 +18,12 @@ def alnumsplit(string): broke = True if broke: if word: - strings.append("".join(word)) + strings.append(u"".join(word)) word = [] numeric = None word.append(char) if word: - strings.append("".join(word)) + strings.append(u"".join(word)) return strings def wordlike(string): @@ -29,56 +31,84 @@ def wordlike(string): Word-like strings contain at least one alphanumeric character. """ - return any(map(type(string).isalnum, string)) -def numeric(string, invalid=float('inf')): - string = unicode(string) - if not any(map(type(string).isnumeric, string)): - return (invalid, string) - if not string: + # Explicit loop is faster than: + #return any(map(type(string).isalnum, string)) + + for c in string: + if c.isalnum(): + return True + else: + return False + +def numeric(orig, invalid=float('inf')): + if not orig: return (invalid, '') + string = unicode(orig) + for c in string: + if c.isnumeric(): + break + else: + return (invalid, orig) mult = 1 - while string[:1] == "-" or string[:1] == "+": - if string[0] == "-": + while string[:1] == u"-" or string[:1] == u"+": + if string[:1] == u"-": mult = -mult string = string[1:] - # Maybe we got lucky and this is a trivial case... + # Early out if possible. try: - return float(string) * mult + return (float(string) * mult, orig) except ValueError: pass # Otherwise we need to do this the hard way. - return mult * float(normalize_dots(string)) + string = normalize_punc(string) + + def _numeric(string): + total = 0 + for c in string: + v = unicodedata.numeric(c) + if v >= 1: + total *= 10 + total += v + return total + + try: + whole, frac = string.split(".") + whole = _numeric(whole) + frac = _numeric(frac) / (10.0 ** len(frac)) + return (mult * (whole + frac), orig) + except ValueError: + return (mult * _numeric(string), orig) -def normalize_dots(string): - string = unicode(string.strip(",.'")) - string = filter(lambda u: u.isnumeric() or u in ",.'", string) - commas = string.count(",") - stops = string.count(".") - quotes = string.count("'") +def normalize_punc(string): + string = unicode(string.strip(u",.'")) + string = filter(lambda u: u.isnumeric() or u in u",.'", string) + commas = string.count(u",") + stops = string.count(u".") + quotes = string.count(u"'") # If anything occurs more than once, it's a separator. if commas > 1: - string = string.replace(",", "") + string = string.replace(u",", u"") commas = 0 if stops > 1: - string = string.replace(".", "") + string = string.replace(u".", u"") stops = 0 if quotes > 1: - string = string.replace("'", "") + string = string.replace(u"'", u"") quotes = 0 - def normalize_two(a, b): + def normalize_two(a, b, string): # One of each - assume the first is grouping, second is point. a_idx = string.rindex(a) b_idx = string.rindex(b) if a_idx > b_idx: - string = string.replace(b, "").replace(a, ".") + string = string.replace(b, u"").replace(a, u".") else: - string = string.replace(a, "").replace(b, ".") + string = string.replace(a, u"").replace(b, u".") return string if commas and stops and quotes: @@ -90,37 +120,43 @@ def normalize_dots(string): # Not really valid, so do whatever we want... # A'AAA.BB,CC # A'AAA,BB.CC - comma_idx = string.index(",") - stops_idx = string.index(".") - quotes_idx = string.index("'") + comma_idx = string.index(u",") + stops_idx = string.index(u".") + quotes_idx = string.index(u"'") if (comma_idx < stops_idx < quotes_idx or quotes_idx < stops_idx < comma_idx): - string = string.replace(",", "").replace("'", "") + string = string.replace(u",", u"").replace(u"'", u"") elif (comma_idx < quotes_idx < stops_idx or stops_idx < quotes_idx < comma_idx): - string = string.replace(",", "").replace(".", "").replace("'", ".") + string = string.replace( + u",", u"").replace( + u".", u"").replace( + u"'", u".") else: - string = string.replace("'", "").replace(".", "").replace(",", ".") + string = string.replace( + u"'", u"").replace( + u".", u"").replace( + u",", u".") elif stops and quotes: - string = normalize_two('.', "'") + string = normalize_two(u".", u"'", string) elif commas and quotes: - string = normalize_two(',', "'") + string = normalize_two(u",", u"'", string) elif commas and stops: - string = normalize_two(',', '.') + string = normalize_two(u",", u".", string) elif commas: - if string[-4:-3] == "," and len(string) <= 7: + if string[-4:-3] == u"," and len(string) <= 7: # Single comma as a thousands separator. - string = string.replace(",", "") + string = string.replace(u",", u"") else: # Single comma, not thousands - probably a decimal point. - string = string.replace(",", ".") + string = string.replace(u",", u".") elif quotes: # Single quote, probably MM'SS", equivalent to a decimal point. - string = string.replace("'", ".") + string = string.replace(u"'", u".") return string diff --git a/pycollate b/pycollate index ec7f9f9..e38e91d 100755 --- a/pycollate +++ b/pycollate @@ -54,7 +54,7 @@ def main(argv): line = line.strip() line = line.decode(encoding, "replace") lines.append(line) - lines.sort(key=collate.key) + lines.sort(key=collate.collator.sortemekey) for line in lines: print line.encode(encoding, "replace") -- 2.30.2