"""Abstract base collator."""
+import re
+
import collate.strings
class Collator(object):
only meaningful when compared to other sort keys from the same
collator.
"""
- if isinstance(string, str):
- string = string.decode(self.encoding, 'replace')
- return string
+ return self.unicode(string)
- def words(self, string):
- """Split the string along word boundries."""
- if isinstance(string, str):
- string = string.decode(self.encoding, 'replace')
- return string.split()
+ def words(self, string, sep=re.compile(r"(\s+)", re.UNICODE)):
+ """Split the string into separate words.
+
+ The string is first passed through self.unicode().  Because the
+ default separator pattern wraps the whitespace run in a capturing
+ group, re.split() also returns the separators themselves, so
+ joining the result with an empty string reproduces the converted
+ input.
+ """
+ return re.split(sep, self.unicode(string))
def sortemekey(self, string):
"""Return a key based on sortemes of a string.
numbers if 'too much' punctuation exists in between, between
lines.
"""
- if isinstance(string, str):
- string = string.decode(self.encoding, 'replace')
-
+ string = self.unicode(string)
# Shove the sortkeyed original string on the end to resolve
# ties intelligently.
return (collate.strings.sortemes(string, self.key),
self.key(string))
+ def unicode(self, string):
+ """Convert a str to a unicode using the collator encoding.
+
+ The builtin unicode() conversion is tried first; only when it
+ raises UnicodeError is the value decoded with self.encoding,
+ using the 'replace' error handler.
+ """
+ try:
+ return unicode(string)
+ except UnicodeError:
+ return string.decode(self.encoding, 'replace')
+
+ def str(self, string):
+ """Convert a unicode to a str using the collator encoding.
+
+ The builtin str() conversion is tried first; only when it
+ raises UnicodeError is the value encoded with self.encoding,
+ using the 'replace' error handler.
+ """
+ try:
+ return str(string)
+ except UnicodeError:
+ return string.encode(self.encoding, 'replace')
+
+    def lstripwords(
+            self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+        """Strip words and whitespace from the start of a string.
+
+        string is converted with self.unicode() and tokenized with
+        self.words().  Leading tokens are removed for as long as they
+        are whitespace or their lowercased form is in strip.
+
+        If append is not empty, it and the words stripped from the
+        front are appended to the end.  Returns a unicode string.
+        """
+        # Discard empty tokens.  re.split() puts u"" in front of leading
+        # whitespace, and that empty token used to end the scan below
+        # before anything had been stripped.
+        words = [word for word in self.words(self.unicode(string)) if word]
+        cut = 0
+        while cut < len(words) and (
+                words[cut].isspace() or words[cut].lower() in strip):
+            cut += 1
+        stripped, words = words[:cut], words[cut:]
+        # Whitespace at either edge of the stripped run must not be
+        # carried over to the end of the result.
+        while stripped and stripped[-1].isspace():
+            stripped.pop()
+        while stripped and stripped[0].isspace():
+            del stripped[0]
+        if append and stripped:
+            # Only add the joiner when something is left in front of it.
+            if words:
+                words.append(append)
+            words.extend(stripped)
+        return u"".join(words)
+
+ def lstripsortemekey(
+ self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+ """Return a key based on sortemes of a prefix-stripped string.
+
+ The result is a pair: the sortemekey() of the string after
+ lstripwords(string, strip, append), followed by key() of the
+ unstripped string.
+ """
+ string = self.unicode(string)
+ stripped = self.lstripwords(string, strip, append)
+ # The second element is computed from the unstripped string.
+ return (self.sortemekey(stripped), self.key(string))
def words(self, string):
"""Split the string along word boundries."""
- if isinstance(string, str):
- string = string.decode(self.encoding)
- words = self._breaker.words(string)
- return [w for w in words if not w.isspace()]
+ string = self.unicode(string)
+ return self._breaker.words(string)
def key(self, string):
"""Sort key for a string.
instance according to the 'encoding' attribute of the
Collator.
"""
- if isinstance(string, str):
- string = string.decode(self.encoding, 'replace')
+ string = self.unicode(string)
return self._collator.key(string)
__all__ = ["Collator"]
import locale
-import re
import collate.errors
import collate._abcollator
"""
try:
return locale.strxfrm(string)
- except UnicodeEncodeError:
- return locale.strxfrm(string.encode(self.encoding, "replace"))
-
- def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
- """Split the string into separate words."""
- if isinstance(string, str):
- string = string.decode(self.encoding, 'replace')
- return re.split(sep, string)
-
+ except UnicodeError:
+ # String objects have no .str() method; the byte-string conversion
+ # is the collator's own str() helper, which applies self.encoding.
+ return locale.strxfrm(self.str(string))