From d46d035bdc1ef7276af7c41880034226d0cdfbfc Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Tue, 16 Feb 2010 00:10:44 -0800 Subject: [PATCH] Calculate sortemes using simply alnum splitting rather than word breaks. Faster and slightly more accurate for our purposes. Strip punctuation. --- collate/_abcollator.py | 13 ++++++------- collate/_strings.py | 33 ++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/collate/_abcollator.py b/collate/_abcollator.py index bc43dc3..02cb733 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -13,12 +13,11 @@ class Collator(object): return string.split() def sortemes(self, string): - words = [] - for word in self.words(string): - words.extend(collate._strings.alnumsplit(word)) - return filter(collate._strings.wordlike, words) + return collate._strings.alnumsplit(string) - def sortemekey(self, string): - words = map(collate._strings.numeric, self.sortemes(string)) - words = [(i, self.key(word)) for (i, word) in words] + def sortemekey(self, string, invalid=float('inf')): + words = [] + for sorteme in self.sortemes(string): + num, alpha = collate._strings.numeric(sorteme, invalid) + words.append((num, self.key(alpha))) return words diff --git a/collate/_strings.py b/collate/_strings.py index fd18bb9..f81bfd7 100644 --- a/collate/_strings.py +++ b/collate/_strings.py @@ -1,11 +1,20 @@ import unicodedata +def strip_nonalnum(string): + while string and not (string[0].isalpha() or string[0].isnumeric()): + string = string[1:] + while string and not (string[-1].isalpha() or string[-1].isnumeric()): + string = string[:-1] + return string + def alnumsplit(string): + if not string: + return [] string = unicode(string) strings = [] - word = [] numeric = None - for char in string: + start = 0 + for i, char in enumerate(string): if numeric is None: broke = False if char.isnumeric(): @@ -14,16 +23,18 @@ def alnumsplit(string): numeric = False elif numeric and char.isalpha(): broke = True + numeric = False + elif numeric and char.isspace(): + broke = True + numeric = None elif not numeric and char.isnumeric(): broke = True + numeric = True if broke: - if word: - strings.append(u"".join(word)) - word = [] - numeric = None - word.append(char) - if word: - strings.append(u"".join(word)) + strings.append(strip_nonalnum(string[start:i])) + start = i + broke = False + strings.append(strip_nonalnum(string[start:i + 1])) return strings def wordlike(string): @@ -44,6 +55,7 @@ def wordlike(string): def numeric(orig, invalid=float('inf')): if not orig: return (invalid, '') + string = unicode(orig) for c in string: if c.isnumeric(): @@ -57,6 +69,9 @@ def numeric(orig, invalid=float('inf')): mult = -mult string = string[1:] + if not string[:1].isnumeric(): + return (invalid, orig) + # Early out if possible. try: return (float(string) * mult, orig) -- 2.30.2