--- /dev/null
+include README.txt
+include tests/*/*.list.txt
+include tests/*/*.py
+include ZPL.txt
+include pycollate
+"""collate - Sort strings intelligently.
+
+This module provides tools to sort strings in a 'human-expected' order.
+
+Because human expectations are fuzzy and often self-contradictory, the
+sort order is not guaranteed to be stable between versions of this
+module (rather the opposite - the primary reason to update it will
+probably be changed sort results).
+
+If available, this module uses the ICU localization library.
+Otherwise, it uses the system's locale database (and produces
+significantly worse results).
+
+Trivial Use:
+------------
+strings = read_strings(...)
+strings.sort(key=collate.key)
+
+Attributes:
+-----------
+default - The default collation backend module. If available, this
+ is collate.icu; otherwise, it is collate.syslocale. In special
+ situations, it may be collate.codepoint.
+
+collator - The default collator. This is the collator of the default
+ backend, instantiated with the default system locale and encoding.
+
+"""
+
+__all__ = ["collator", "set_collator", "cmp", "key", "default",
+ "VERSION", "VERSION_STRING"]
+
import collate.errors
import collate._locale
except ImportError:
pass
+VERSION = (0, 1)
+VERSION_STRING = ".".join(map(str, VERSION))
+
collator = None
-def set_default(backend=None, locale=None, encoding=None):
+def set_collator(backend=None, locale=None, encoding=None):
+ """Set the default collation backend.
+
+ This function tries very hard not to fail; the resulting Collator
+ may not have the locale or encoding you specified (at the very
+ least, they will be normalized). Remember to check
+ collator.locale and collator.encoding.
+
+ Arguments:
+ backend - a backend module (collate.icu, collate.syslocale, or
+ collate.codepoint); None to not change.
+ locale - e.g. 'en_US', or None for the system locale.
+ encoding - e.g. 'utf-8', or None for the system locale encoding
+
+ Returns:
+ The new default Collator instance, or None if no collator could be
+ created; if None is returned, the existing default Collator is
+ left intact.
+
+ """
global collator
global default
if backend is None:
backend = default
locales = collate._locale.localelist(locale)
- c = None
+ possible = None
for locale in locales:
locale, encoding_ = collate._locale.getpair(locale, encoding)
try:
- c = backend.Collator(locale, encoding_)
+ possible = backend.Collator(locale, encoding_)
except collate.errors.InvalidLocaleError:
pass
else:
break
- if c is not None:
- collator = c
+ if possible is not None:
+ collator = possible
default = backend
+ return collator
+ return possible
def key(string):
- return collator.key(string)
+ """Return a good sorting key for the string.
+
+ The sort key should be considered an opaque value which is only
+ meaningful when compared to other sort keys from the same
+ collator.
+
+ This is the same as collate.collator.sortemekey(string).
+ """
+ return collator.sortemekey(string)
def cmp(a, b):
+ """Return negative if a < b, zero if a == b, positive if a > b.
+
+ This is the same as collate.collator.cmp(a, b).
+ """
return collator.cmp(a, b)
-set_default()
+set_collator()
+
+if collator is None:
+ raise collate.errors.InvalidLocaleError("C")
+"""Abstract base collator."""
+
import collate.strings
class Collator(object):
+ """Abstract base class for Collators.
+
+ Attributes:
+ locale - the collator follows rules for this locale
+ encoding - assumed string encoding
+ """
+
+ locale = "C"
encoding = "ascii"
+ def __init__(self, locale=None, encoding=None):
+ pass
+
def cmp(self, string1, string2):
"""Return negative if a < b, zero if a == b, positive if a > b."""
return cmp(self.key(string1), self.key(string2))
def key(self, string):
+ """Return a good sorting key for the string.
+
+ The sort key should be considered an opaque value which is
+ only meaningful when compared to other sort keys from the same
+ collator.
+ """
+ if isinstance(string, str):
+ string = string.decode(self.encoding, 'replace')
return string
def words(self, string):
def sortemekey(self, string):
"""Return a key based on sortemes of a string.
- If the string is a str instance, it is decoded to a unicode
- instance according to the 'encoding' attribute of the
- Collator.
+ A sorteme, by analogy with grapheme/morpheme/etc., is an atom
+ of sort information. This is larger than a word boundary but
+ smaller than a sentence boundary; roughly, a sorteme boundary
+ occurs between letters and numbers, between numbers and
+ numbers if 'too much' punctuation exists in between, and
+ between lines.
"""
if isinstance(string, str):
string = string.decode(self.encoding, 'replace')
+++ /dev/null
-VERSION = 0.1
"""Locale utility routines."""
+__all__ = ["localelist", "encoding", "getpair"]
+
import sys
try:
except ImportError:
codecs = None
-__all__ = ["localelist"]
-
def localelist(*locales):
"""Normalize and return a list of locales, with appended defaults.
return locale.getpreferredencoding() or sys.getdefaultencoding()
def getpair(locale_, encoding_):
+ """If encoding_ is unspecified, get the locale encoding."""
if "." in locale_:
if encoding_ is None:
locale_, encoding_ = locale_.rsplit(".", 1)
"""
+__all__ = ["Collator"]
+
import collate._abcollator
import collate._locale
-__all__ = ["Collate"]
-
class Collator(collate._abcollator.Collator):
"""Codepoint-based collation.
"""
def __init__(self, locale=None, encoding=None):
+ super(Collator, self).__init__(locale, encoding)
dummy, self.encoding = collate._locale.getpair(locale, encoding)
self.locale = "C"
-
- def key(self, string):
- """Sort key for a string.
-
- If string is a str instance, it is first decoded according to
- the 'encoding' attribute of the Collator.
- """
- if isinstance(string, str):
- string = string.decode(self.encoding, 'replace')
- return string
+"""Exception types for the collate module."""
+
class InvalidLocaleError(ValueError):
- pass
+ """Raised when an invalid locale is given to a function.
+
+ Attributes:
+ locale - the attempted invalid locale
+ """
+ def __init__(self, locale, string=""):
+ self.locale = locale
+ ValueError.__init__(self, string or locale)
"""
+__all__ = ["Collator"]
+
import collate._abcollator
import collate._locale
import collate.errors
"""ICU-based collation."""
def __init__(self, locale, encoding=None):
+ super(Collator, self).__init__(locale, encoding)
locale, encoding = collate._locale.getpair(locale, encoding)
icu_locale = "root" if locale == "C" else locale
self._collator = _icu.Collator(icu_locale)
string = string.decode(self.encoding, 'replace')
return self._collator.key(string)
- def cmp(self, a, b):
- """Return negative if a < b, zero if a == b, positive if a > b.
-
- If strs rather than unicodes are passed in, they are first
- decoded according to the 'encoding' attribute of the Collator.
- """
- if isinstance(a, str):
- a = a.decode(self.encoding, 'replace')
- if isinstance(b, str):
- b = b.decode(self.encoding, 'replace')
- return self._collator.cmp(a, b)
"""
+__all__ = ["Collator"]
+
import locale
import re
"""C library locale-based collation."""
def __init__(self, locale_code, encoding=None):
+ super(Collator, self).__init__(locale_code, encoding)
locale_code, encoding = collate._locale.getpair(locale_code, encoding)
try:
setlocale = locale_code + "." + encoding
except UnicodeEncodeError:
return locale.strxfrm(string.encode(self.encoding, "replace"))
- def cmp(self, a, b):
- """Return negative if a < b, zero if a == b, positive if a > b.
-
- If strs rather than unicodes are passed in, they are first
- decoded according to the 'encoding' attribute of the Collator.
- """
- if isinstance(a, str):
- a = a.decode(self.encoding, "replace")
- if isinstance(b, str):
- b = b.decode(self.encoding, "replace")
- return locale.strcoll(a, b)
-
def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
- """Split the string into separate words.
-
- This split is done using the locale's notion of a word boundry.
- """
+ """Split the string into separate words."""
+ if isinstance(string, str):
+ string = string.decode(self.encoding, 'replace')
return re.split(sep, string)
if options.backend or options.locale or options.encoding:
backend = getattr(collate, options.backend)
- collate.set_default(backend, options.locale, options.encoding)
+ collate.set_collator(backend, options.locale, options.encoding)
if options.diagnostics:
print >>sys.stderr, "Backend: " + collate.default.__name__
line = line.strip()
line = line.decode(encoding, "replace")
lines.append(line)
- lines.sort(key=collate.collator.sortemekey)
+ lines.sort(key=collate.key)
for line in lines:
print line.encode(encoding, "replace")