From 9a7cf6459c40d53b58634f2df56386bf52c12f7c Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Fri, 19 Feb 2010 01:23:57 -0800 Subject: [PATCH] More tweaks; notably try to insert paragraph breaks rather than a separate Python tuple when re-concatenating strings. --- collate/_abcollator.py | 20 +++++++++++++++++--- collate/icu/__init__.py | 7 +++++++ collate/{_strings.py => strings.py} | 14 +++++++++----- 3 files changed, 33 insertions(+), 8 deletions(-) rename collate/{_strings.py => strings.py} (95%) diff --git a/collate/_abcollator.py b/collate/_abcollator.py index fdd7783..0ae5d45 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -1,14 +1,28 @@ -import collate._strings +import collate.strings class Collator(object): def cmp(self, string1, string2): """Return negative if a < b, zero if a == b, positive if a > b.""" return cmp(self.key(string1), self.key(string2)) + def words(self, string): + """Split the string along word boundaries.""" + if isinstance(string, str): + string = string.decode(self.encoding, 'replace') + return string.split() + def sortemekey(self, string, invalid=float('inf')): + """Return a key based on sortemes of a string. + + If the string is a str instance, it is decoded to a unicode + instance according to the 'encoding' attribute of the + Collator. + """ keys = [] - for sorteme in collate._strings.sortemes(string): - num, alpha = collate._strings.numeric(sorteme, invalid) + if isinstance(string, str): + string = string.decode(self.encoding, 'replace') + for sorteme in collate.strings.sortemes(string): + num, alpha = collate.strings.numeric(sorteme, invalid) if num == invalid: keys.append(self.key(alpha)) else: diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py index 892b8a1..5f3ec05 100644 --- a/collate/icu/__init__.py +++ b/collate/icu/__init__.py @@ -36,6 +36,13 @@ class Collator(collate._abcollator.Collator): # so this is a harmless error. 
self._breaker = _icu.WordBreaker("root") + def words(self, string): + """Split the string along word boundaries.""" + if isinstance(string, str): + string = string.decode(self.encoding) + words = self._breaker.words(string) + return [w for w in words if not w.isspace()] + def key(self, string): """Sort key for a string. diff --git a/collate/_strings.py b/collate/strings.py similarity index 95% rename from collate/_strings.py rename to collate/strings.py index aed2ba7..bc4ed62 100644 --- a/collate/_strings.py +++ b/collate/strings.py @@ -10,6 +10,8 @@ CONTINUE_ON = frozenset([ UNKNOWN, LETTER, NUMBER = range(3) +BREAKER = u"\u2029" + def sortemes(string): """Generate a list of sortemes for the string. @@ -32,6 +34,9 @@ def sortemes(string): mode = UNKNOWN previous_mode = UNKNOWN category = "XX" + + # TODO(jfw): This kind of evolved over time, there's probably a much + # faster / more concise way to express it now. for i, c in enumerate(string): broke = False prev_category = category @@ -72,14 +77,13 @@ def sortemes(string): # If we read two strings separated by weird punctuation, # pretend the punctuation isn't there. if (this_mode == previous_mode == LETTER - and (category[0] == "P" or prev_category[0] == "P") and words): - words[-1] += u" " + string[start:last+1] + words[-1] += BREAKER + string[start:last+1] else: # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"] # Which sorts after ["foo", "bar"]. 
if this_mode == NUMBER and previous_mode == LETTER and words: - words[-1] += u" " + words[-1] += BREAKER words.append(string[start:last+1]) previous_mode = this_mode @@ -91,10 +95,10 @@ def sortemes(string): this_mode = mode if start is not None and last is not None: if this_mode == LETTER and previous_mode == LETTER and words: - words[-1] += u" " + string[start:last+1] + words[-1] += BREAKER + string[start:last+1] else: if this_mode == NUMBER and previous_mode == LETTER and words: - words[-1] += u" " + words[-1] += BREAKER words.append(string[start:last+1]) return words -- 2.30.2