X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2F_abcollator.py;fp=collate%2F_abcollator.py;h=dd7ea1430445548736e8f3d754ab9325210ac117;hp=622766d3b5056c27ca9b9a98ce4609961dd11c1d;hb=308778ae560a3258a55d578b1dd52d030ce4399d;hpb=f854c6958fd98bc3a1709e7aa1ede1ce7f5ab612 diff --git a/collate/_abcollator.py b/collate/_abcollator.py index 622766d..dd7ea14 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -1,5 +1,7 @@ """Abstract base collator.""" +import re + import collate.strings class Collator(object): @@ -27,15 +29,11 @@ class Collator(object): only meaningful when compared to other sort keys from the same collator. """ - if isinstance(string, str): - string = string.decode(self.encoding, 'replace') - return string + return self.unicode(string) - def words(self, string): - """Split the string along word boundries.""" - if isinstance(string, str): - string = string.decode(self.encoding, 'replace') - return string.split() + def words(self, string, sep=re.compile(r"(\s+)", re.UNICODE)): + """Split the string into separate words.""" + return re.split(sep, self.unicode(string)) def sortemekey(self, string): """Return a key based on sortemes of a string. @@ -47,11 +45,49 @@ class Collator(object): numbers if 'too much' punctuation exists in between, between lines. """ - if isinstance(string, str): - string = string.decode(self.encoding, 'replace') - + string = self.unicode(string) # Shove the sortkeyed original string on the end to resolve # ties intelligently. return (collate.strings.sortemes(string, self.key), self.key(string)) + def unicode(self, string): + """Convert a str to a unicode using the collator encoding.""" + try: + return unicode(string) + except UnicodeError: + return string.decode(self.encoding, 'replace') + + def str(self, string): + """Convert a unicode to a str using the collator encoding.""" + try: + return str(string) + except UnicodeError: + return string.encode(self.encoding, 'replace') + + def lstripwords( + self, string, strip=collate.strings.INITIAL_STOPS, append=u", "): + """Strip words and whitespace from the start of a string. + + If append is not empty, it and the words stripped from the + front are appended to the end. + """ + string = self.unicode(string) + stripped = [] + words = self.words(string) + while words and (words[0].isspace() or words[0].lower() in strip): + stripped.append(words.pop(0)) + while stripped and stripped[-1].isspace(): + stripped.pop() + if append and stripped: + if words: + words.append(append) + words.extend(stripped) + return u"".join(words) + + def lstripsortemekey( + self, string, strip=collate.strings.INITIAL_STOPS, append=u", "): + """Return a key based on sortemes of a prefix-stripped string.""" + string = self.unicode(string) + stripped = self.lstripwords(string, strip, append) + return (self.sortemekey(stripped), self.key(string))