From f1717db91d1ab7b37937cbe5ab17965cb8b7b592 Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Sun, 21 Feb 2010 21:57:32 -0800 Subject: [PATCH] Simplify algorithm. No changes to sort behavior or runtime. --- collate/_abcollator.py | 12 +++----- collate/strings.py | 63 ++++++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/collate/_abcollator.py b/collate/_abcollator.py index 0ae5d45..9dce32b 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -18,15 +18,11 @@ class Collator(object): instance according to the 'encoding' attribute of the Collator. """ - keys = [] if isinstance(string, str): string = string.decode(self.encoding, 'replace') - for sorteme in collate.strings.sortemes(string): - num, alpha = collate.strings.numeric(sorteme, invalid) - if num == invalid: - keys.append(self.key(alpha)) - else: - keys.append(num) + # Shove the sortkeyed original string on the end to resolve # ties intelligently. - return (keys, self.key(string)) + return (collate.strings.sortemes(string, self.key), + self.key(string)) + diff --git a/collate/strings.py b/collate/strings.py index 5717246..1dffdac 100644 --- a/collate/strings.py +++ b/collate/strings.py @@ -11,8 +11,9 @@ CONTINUE_ON = frozenset([ UNKNOWN, LETTER, NUMBER = range(3) BREAKER = u"\u2029" # Paragraph break character +INFINITY = float('inf') -def sortemes(string): +def sortemes(string, key=lambda s: s): """Generate a list of sortemes for the string. A sorteme, by analogy with grapheme/morpheme/etc. is an atom of @@ -42,13 +43,18 @@ def sortemes(string): word = word[:-1] return word + def aletters(letters): + words.append((INFINITY, stripends(letters))) + def adigits(digits): + words.append((numeric(digits), u'')) + # TODO(jfw): This kind of evolved over time, there's probably a much # faster / more concise way to express it now. for i, (c, category) in enumerate(zip(string, categories)): if letters and previous == LETTER and words: - word = stripends(words.pop().strip()) - letters = list(stripends(word).strip() + BREAKER) + letters + word = stripends(words.pop()[1].strip()) + BREAKER + letters.insert(0, word) previous = UNKNOWN # Split at the first letter following a number or @@ -56,47 +62,47 @@ def sortemes(string): if category[0] == "L": letters.append(c) if digits: - words.append(u"".join(digits).strip()) - previous = NUMBER + adigits(u"".join(digits).strip()) digits = [] + previous = NUMBER # Split at the first number following a non-number or # non-continuing character. elif category[0] == "N": digits.append(c) if letters: - words.append(u"".join(letters)) - previous = LETTER + aletters(u"".join(letters)) letters = [] + previous = LETTER # Only certain punctuation allowed in numbers. elif digits and c not in "',._": - words.append(u"".join(digits)) - previous = NUMBER + adigits(u"".join(digits)) digits = [] + previous = NUMBER # Split if we find a non-continuing character ("weird" ones). elif letters and category not in CONTINUE_ON: if letters: - words.append(u"".join(letters).strip() + BREAKER) - previous = LETTER + aletters(u"".join(letters).strip() + BREAKER) letters = [] + previous = LETTER if digits: - words.append(u"".join(digits).strip() + BREAKER) - previous = NUMBER + adigits(u"".join(digits).strip()) digits = [] + previous = NUMBER # Split if we find two pieces of punctuation in a row, even # if we should otherwise continue. elif i and categories[i-1][0] in "P" and category[0] in "P": if letters: - words.append(u"".join(letters)) - previous = LETTER + aletters(u"".join(letters)) letters = [] + previous = LETTER if digits: - words.append(u"".join(digits)) - previous = NUMBER + adigits(u"".join(digits)) digits = [] + previous = NUMBER else: if digits: @@ -105,30 +111,27 @@ def sortemes(string): letters.append(c) if letters and previous == LETTER and words: - word = stripends(words.pop().strip()) - letters = list(stripends(word).strip() + BREAKER) + letters + word = stripends(words.pop()[1].strip()) + BREAKER + letters.insert(0, word) previous = UNKNOWN if letters: - words.append(u"".join(letters)) - letters = [] + aletters(u"".join(letters)) if digits: - words.append(u"".join(digits)) - digits = [] + adigits(u"".join(digits)) - words = map(stripends, words) - return words + return [(i, key(w) if w else u'') for i, w in words] def numeric(orig, invalid=float('inf')): if not orig: - return (invalid, '') + return invalid string = unicode(orig) for c in string: if c.isnumeric(): break else: - return (invalid, orig) + return invalid mult = 1 while string[:1] == u"-" or string[:1] == u"+": @@ -143,7 +146,7 @@ def numeric(orig, invalid=float('inf')): # Early out if possible. try: - return (float(string) * mult, orig) + return float(string) * mult except ValueError: pass @@ -161,9 +164,9 @@ def numeric(orig, invalid=float('inf')): whole, frac = string.split(".") whole = _numeric(whole) frac = _numeric(frac) / (10.0 ** len(frac)) - return (mult * (whole + frac), orig) + return mult * (whole + frac) except ValueError: - return (mult * _numeric(string), orig) + return mult * _numeric(string) def normalize_punc(string): string = unicode(string.strip(u",.'")) -- 2.20.1