From: Joe Wreschnig Date: Thu, 25 Feb 2010 23:54:14 +0000 (-0800) Subject: strings: Microoptimizations, saves about 10% of runtime. X-Git-Url: https://git.korewanetadesu.com/?a=commitdiff_plain;h=a3bd28edb9b44a1531af3ef8c4ae9cd6a4e2d3b3;p=python-collate.git strings: Microoptimizations, saves about 10% of runtime. --- diff --git a/collate/strings.py b/collate/strings.py index e2750d1..487257f 100644 --- a/collate/strings.py +++ b/collate/strings.py @@ -62,25 +62,23 @@ def sortemes(string, key=lambda s: s): """ + if not string: + return [] + words = [] letters = [] digits = [] - if not string: - return words + lappend = letters.append + dappend = digits.append string = unicode(string) categories = map(unicodedata.category, string) previous = UNKNOWN + wappend = words.append + join = u"".join + i = 0 - def aletters(letters): - """Add a group of letters to the word list.""" - words.append((INFINITY, stripends(letters))) - def adigits(digits): - """Add a group of digits to the word list.""" - words.append((numeric(digits), u'')) - - # TODO(jfw): This kind of evolved over time, there's probably a much - # faster / more concise way to express it now. - for i, (uchar, category) in enumerate(zip(string, categories)): + for uchar in string: + category = categories[i] if letters and previous == LETTER and words: word = stripends(words.pop()[1].strip()) + BREAKER @@ -90,57 +88,61 @@ def sortemes(string, key=lambda s: s): # Split at the first letter following a number or # non-continuing character. if category[0] == "L": - letters.append(uchar) + lappend(uchar) if digits: - adigits(u"".join(digits).strip()) - digits = [] + words.append((numeric(join(digits).strip()), u'')) + del(digits[:]) previous = NUMBER # Split at the first number following a non-number or # non-continuing character. elif category[0] == "N": - digits.append(uchar) + dappend(uchar) if letters: if unicodedata.category(letters[-1])[0] == "L": - letters.append(HBREAKER) - aletters(u"".join(letters)) - letters = [] + lappend(HBREAKER) + wappend((INFINITY, stripends(join(letters)))) + del(letters[:]) previous = LETTER # Only certain punctuation allowed in numbers. elif digits and uchar not in ALLOWED_IN_NUMBERS: - adigits(u"".join(digits)) - digits = [] + words.append((numeric(join(digits)), u'')) + del(digits[:]) previous = NUMBER # Split if we find a non-continuing character ("weird" ones). - elif letters and category not in CONTINUE_ON: + elif category not in CONTINUE_ON: if letters: - aletters(u"".join(letters).strip() + BREAKER) - letters = [] + wappend( + (INFINITY, + stripends(join(letters).strip() + BREAKER))) + del(letters[:]) previous = LETTER if digits: - adigits(u"".join(digits).strip()) - digits = [] + words.append((numeric(join(digits)), u'')) + del(digits[:]) previous = NUMBER # Split if we find two pieces of punctuation in a row, even # if we should otherwise continue. - elif i and categories[i-1][0] in "P" and category[0] in "P": + elif i and categories[i - 1][0] == category[0] == "P": if letters: - aletters(u"".join(letters)) - letters = [] + wappend((INFINITY, stripends(join(letters)))) + del(letters[:]) previous = LETTER if digits: - adigits(u"".join(digits)) - digits = [] + words.append((numeric(join(digits)), u'')) + del(digits[:]) previous = NUMBER else: if digits: - digits.append(uchar) + dappend(uchar) elif letters: - letters.append(uchar) + lappend(uchar) + + i += 1 if letters and previous == LETTER and words: word = stripends(words.pop()[1].strip()) + BREAKER @@ -148,11 +150,11 @@ def sortemes(string, key=lambda s: s): previous = UNKNOWN if letters: - aletters(u"".join(letters)) + wappend((INFINITY, stripends(join(letters)))) if digits: - adigits(u"".join(digits)) + words.append((numeric(join(digits)), u'')) - return [(i, key(w) if w else u'') for i, w in words] + return [(i, key(w)) for i, w in words] def numeric(orig, invalid=INFINITY): """Parse a number out of a string.