From 308778ae560a3258a55d578b1dd52d030ce4399d Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Thu, 25 Feb 2010 20:49:40 -0800 Subject: [PATCH] Collator.lstripwords: Strip words off the start and append to the end. Collator.unicode, Collator.str: Convenience recoding routines. --- NEWS.txt | 2 ++ collate/__init__.py | 2 +- collate/_abcollator.py | 58 +++++++++++++++++++++++++++++++++-------- collate/icu/__init__.py | 9 +++---- collate/strings.py | 2 ++ collate/syslocale.py | 12 ++------- setup.py | 2 +- 7 files changed, 58 insertions(+), 29 deletions(-) diff --git a/NEWS.txt b/NEWS.txt index 823498d..c2cbbc0 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -1,6 +1,8 @@ 2010 - 0.2 - Dedicated to 2 (U+FF12) - 直感~ before 直感2~. - Detect Unicode Roman numeral codepoints and sort them numerically. + - lstripwords and lstripsortemekey methods, e.g. sort "A Perfect Sky" + as "Perfect Sky, A". 2010.02.22 - 0.1 - Dedicated to 勘 (U+52D8) - Initial release. diff --git a/collate/__init__.py b/collate/__init__.py index 5094071..ea1516a 100644 --- a/collate/__init__.py +++ b/collate/__init__.py @@ -52,7 +52,7 @@ try: except ImportError: pass -VERSION = (0, 1) +VERSION = (0, 2) VERSION_STRING = ".".join(map(str, VERSION)) collator = None diff --git a/collate/_abcollator.py b/collate/_abcollator.py index 622766d..dd7ea14 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -1,5 +1,7 @@ """Abstract base collator.""" +import re + import collate.strings class Collator(object): @@ -27,15 +29,11 @@ class Collator(object): only meaningful when compared to other sort keys from the same collator. 
""" - if isinstance(string, str): - string = string.decode(self.encoding, 'replace') - return string + return self.unicode(string) - def words(self, string): - """Split the string along word boundries.""" - if isinstance(string, str): - string = string.decode(self.encoding, 'replace') - return string.split() + def words(self, string, sep=re.compile(r"(\s+)", re.UNICODE)): + """Split the string into separate words.""" + return re.split(sep, self.unicode(string)) def sortemekey(self, string): """Return a key based on sortemes of a string. @@ -47,11 +45,49 @@ class Collator(object): numbers if 'too much' punctuation exists in between, between lines. """ - if isinstance(string, str): - string = string.decode(self.encoding, 'replace') - + string = self.unicode(string) # Shove the sortkeyed original string on the end to resolve # ties intelligently. return (collate.strings.sortemes(string, self.key), self.key(string)) + def unicode(self, string): + """Convert a str to a unicode using the collator encoding.""" + try: + return unicode(string) + except UnicodeError: + return string.decode(self.encoding, 'replace') + + def str(self, string): + """Convert a unicode to a str using the collator encoding.""" + try: + return str(string) + except UnicodeError: + return string.encode(self.encoding, 'replace') + + def lstripwords( + self, string, strip=collate.strings.INITIAL_STOPS, append=u", "): + """Strip words and whitespace from the start of a string. + + If append is not empty, it and the words stripped from the + front are appended to the end. 
+ """ + string = self.unicode(string) + stripped = [] + words = self.words(string) + while words and (words[0].isspace() or words[0].lower() in strip): + stripped.append(words.pop(0)) + while stripped and stripped[-1].isspace(): + stripped.pop() + if append and stripped: + if words: + words.append(append) + words.extend(stripped) + return u"".join(words) + + def lstripsortemekey( + self, string, strip=collate.strings.INITIAL_STOPS, append=u", "): + """Return a key based on sortemes of a prefix-stripped string.""" + string = self.unicode(string) + stripped = self.lstripwords(string, strip, append) + return (self.sortemekey(stripped), self.key(string)) diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py index e7dfe2d..6d27a6f 100644 --- a/collate/icu/__init__.py +++ b/collate/icu/__init__.py @@ -41,10 +41,8 @@ class Collator(collate._abcollator.Collator): def words(self, string): """Split the string along word boundries.""" - if isinstance(string, str): - string = string.decode(self.encoding) - words = self._breaker.words(string) - return [w for w in words if not w.isspace()] + string = self.unicode(string) + return self._breaker.words(string) def key(self, string): """Sort key for a string. @@ -53,7 +51,6 @@ class Collator(collate._abcollator.Collator): instance according to the 'encoding' attribute of the Collator. 
""" - if isinstance(string, str): - string = string.decode(self.encoding, 'replace') + string = self.unicode(string) return self._collator.key(string) diff --git a/collate/strings.py b/collate/strings.py index 487257f..16ba8be 100644 --- a/collate/strings.py +++ b/collate/strings.py @@ -40,6 +40,8 @@ ROMAN = { u"\u2188": 100000, } +INITIAL_STOPS = frozenset([u"a", u"an", u"the"]) + def stripends(word): """Strip punctuation and symbols from the ends of a string.""" while word and unicodedata.category(word[0])[0] in "PS": diff --git a/collate/syslocale.py b/collate/syslocale.py index 5b8adca..e48ee82 100644 --- a/collate/syslocale.py +++ b/collate/syslocale.py @@ -25,7 +25,6 @@ Avoid this backend if... __all__ = ["Collator"] import locale -import re import collate.errors import collate._abcollator @@ -54,12 +53,5 @@ class Collator(collate._abcollator.Collator): """ try: return locale.strxfrm(string) - except UnicodeEncodeError: - return locale.strxfrm(string.encode(self.encoding, "replace")) - - def words(self, string, sep=re.compile(r"\W+", re.UNICODE)): - """Split the string into separate words.""" - if isinstance(string, str): - string = string.decode(self.encoding, 'replace') - return re.split(sep, string) - + except UnicodeError: + return locale.strxfrm(self.str(string)) diff --git a/setup.py b/setup.py index 99a13af..e47d620 100755 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ else: libraries = ['icui18n', 'icuuc', 'icudata'] setup(name='collate', - version='0.1', + version='0.2', author="Joe Wreschnig", author_email="joe.wreschnig@gmail.com", description="Python text collation", -- 2.20.1