From 7644110ce07ec8a78003ee7db9dcdfe5cbca3854 Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Mon, 22 Feb 2010 03:18:36 -0800 Subject: [PATCH] Cleanup in preparation for release. Add docstrings, remove basically empty _constants module. --- MANIFEST.in | 5 +++ collate/__init__.py | 84 +++++++++++++++++++++++++++++++++++++---- collate/_abcollator.py | 30 +++++++++++++-- collate/_constants.py | 1 - collate/_locale.py | 5 ++- collate/codepoint.py | 15 ++------ collate/errors.py | 11 +++++- collate/icu/__init__.py | 14 ++----- collate/syslocale.py | 22 +++-------- pycollate | 4 +- 10 files changed, 136 insertions(+), 55 deletions(-) create mode 100644 MANIFEST.in delete mode 100644 collate/_constants.py diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..11aaf94 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +include README.txt +include tests/*/*.list.txt +include tests/*/*.py +include ZPL.txt +include pycollate diff --git a/collate/__init__.py b/collate/__init__.py index 7e89fe6..52d9e03 100644 --- a/collate/__init__.py +++ b/collate/__init__.py @@ -1,3 +1,35 @@ +"""collate - Sort strings intelligently. + +This module provides tools to sort strings in a 'human-expected' order. + +Because human expectations are fuzzy and often self-contradictory, the +sort order is not guaranteed to be stable between versions of this +module (rather the opposite - the primary reason to update it will +probably be changed sort results). + +If available, this module uses the ICU localization library. +Otherwise, it uses the system's locale database (and produces +significantly worse results). + +Trivial Use: +------------ +strings = read_strings(...) +strings.sort(key=collate.key) + +Attributes: +----------- +backend - The default collation backend. If available, this is + collate.icu; otherwise, it is collate.syslocale. In special + situations, it may be collate.codepoint. + +collate - The default collator. 
This is the collator of the default + backend instantiated with the default system locale and encoding. + +""" + +__all__ = ["collator", "set_collator", "cmp", "key", "default", + "VERSION", "VERSION_STRING"] + import collate.errors import collate._locale @@ -14,32 +46,70 @@ try: except ImportError: pass +VERSION = (0, 1) +VERSION_STRING = ".".join(map(str, VERSION)) + collator = None -def set_default(backend=None, locale=None, encoding=None): +def set_collator(backend=None, locale=None, encoding=None): + """Set the default collation backend. + + This function tries very hard not to fail; the resulting Collator + may not have the locale or encoding you specified (at the very + least, they will be normalized). Remember to check + collator.locale and collator.encoding. + + Arguments: + backend - 'icu', 'syslocale', or 'codepoint'; None to not change. + locale - e.g. 'en_US', or None for the system locale. + encoding - e.g. 'utf-8', or None for the system locale encoding + + Returns: + The new default Collator instance, or None if no collator could be + created; if None is returned, the existing default Collator is + left intact. + + """ global collator global default if backend is None: backend = default locales = collate._locale.localelist(locale) - c = None + possible = None for locale in locales: locale, encoding_ = collate._locale.getpair(locale, encoding) try: - c = backend.Collator(locale, encoding_) + possible = backend.Collator(locale, encoding_) except collate.errors.InvalidLocaleError: pass else: break - if c is not None: - collator = c + if possible is not None: + collator = possible default = backend + return collator + return possible def key(string): - return collator.key(string) + """Return a good sorting key for the string. + + The sort key should be considered an opaque value which is only + meaningful when compared to other sort keys from the same + collator. + + This is the same as collate.collator.sortemekey(string). 
+ """ + return collator.sortemekey(string) def cmp(a, b): + """Return negative if a < b, zero if a == b, positive if a > b. + + This is the same as collate.collator.cmp(a, b). + """ return collator.cmp(a, b) -set_default() +set_collator() + +if collator is None: + raise collate.errors.InvalidLocaleError("C") diff --git a/collate/_abcollator.py b/collate/_abcollator.py index 12575ea..622766d 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -1,13 +1,34 @@ +"""Abstract base collator.""" + import collate.strings class Collator(object): + """Abstract base class for Collators. + + Attributes: + locale - the collator follows rules for this locale + encoding - assumed string encoding + """ + + locale = "C" encoding = "ascii" + def __init__(self, locale=None, encoding=None): + pass + def cmp(self, string1, string2): """Return negative if a < b, zero if a == b, positive if a > b.""" return cmp(self.key(string1), self.key(string2)) def key(self, string): + """Return a good sorting key for the string. + + The sort key should be considered an opaque value which is + only meaningful when compared to other sort keys from the same + collator. + """ + if isinstance(string, str): + string = string.decode(self.encoding, 'replace') return string def words(self, string): @@ -19,9 +40,12 @@ class Collator(object): def sortemekey(self, string): """Return a key based on sortemes of a string. - If the string is a str instance, it is decoded to a unicode - instance according to the 'encoding' attribute of the - Collator. + A sorteme, by analogy with grapheme/morpheme/etc., is an atom + of sort information. This is larger than a word boundary but + smaller than a sentence boundary; roughly, a sorteme boundary + occurs between letters and numbers, between numbers and + numbers if 'too much' punctuation exists in between, and between + lines. 
""" if isinstance(string, str): string = string.decode(self.encoding, 'replace') diff --git a/collate/_constants.py b/collate/_constants.py deleted file mode 100644 index eb1786c..0000000 --- a/collate/_constants.py +++ /dev/null @@ -1 +0,0 @@ -VERSION = 0.1 diff --git a/collate/_locale.py b/collate/_locale.py index 70c085a..d20e184 100644 --- a/collate/_locale.py +++ b/collate/_locale.py @@ -1,5 +1,7 @@ """Locale utility routines.""" +__all__ = ["localelist", "encoding", "getpair"] + import sys try: @@ -12,8 +14,6 @@ try: except ImportError: codecs = None -__all__ = ["localelist"] - def localelist(*locales): """Normalize and return a list of locales, with appended defaults. @@ -114,6 +114,7 @@ def encoding(preferred=None): return locale.getpreferredencoding() or sys.getdefaultencoding() def getpair(locale_, encoding_): + """If encoding_ is unspecified, get the locale encoding.""" if "." in locale_: if encoding_ is None: locale_, encoding_ = locale_.rsplit(".", 1) diff --git a/collate/codepoint.py b/collate/codepoint.py index e0bbcbd..9987933 100644 --- a/collate/codepoint.py +++ b/collate/codepoint.py @@ -15,11 +15,11 @@ Avoid this backend if... """ +__all__ = ["Collator"] + import collate._abcollator import collate._locale -__all__ = ["Collate"] - class Collator(collate._abcollator.Collator): """Codepoint-based collation. @@ -29,15 +29,6 @@ class Collator(collate._abcollator.Collator): """ def __init__(self, locale=None, encoding=None): + super(Collator, self).__init__(locale, encoding) dummy, self.encoding = collate._locale.getpair(locale, encoding) self.locale = "C" - - def key(self, string): - """Sort key for a string. - - If string is a str instance, it is first decoded according to - the 'encoding' attribute of the Collator. 
- """ - if isinstance(string, str): - string = string.decode(self.encoding, 'replace') - return string diff --git a/collate/errors.py b/collate/errors.py index cef5c07..ca8eec7 100644 --- a/collate/errors.py +++ b/collate/errors.py @@ -1,2 +1,11 @@ +"""Exception types for the collate module.""" + class InvalidLocaleError(ValueError): - pass + """Raised when an invalid locale is given to a function. + + Attributes: + locale - the attempted invalid locale + """ + def __init__(self, locale, string=""): + self.locale = locale + ValueError.__init__(self, string or locale) diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py index 5f3ec05..e7dfe2d 100644 --- a/collate/icu/__init__.py +++ b/collate/icu/__init__.py @@ -11,6 +11,8 @@ Avoid this backend if... """ +__all__ = ["Collator"] + import collate._abcollator import collate._locale import collate.errors @@ -21,6 +23,7 @@ class Collator(collate._abcollator.Collator): """ICU-based collation.""" def __init__(self, locale, encoding=None): + super(Collator, self).__init__(locale, encoding) locale, encoding = collate._locale.getpair(locale, encoding) icu_locale = "root" if locale == "C" else locale self._collator = _icu.Collator(icu_locale) @@ -54,14 +57,3 @@ class Collator(collate._abcollator.Collator): string = string.decode(self.encoding, 'replace') return self._collator.key(string) - def cmp(self, a, b): - """Return negative if a < b, zero if a == b, positive if a > b. - - If strs rather than unicodes are passed in, they are first - decoded according to the 'encoding' attribute of the Collator. - """ - if isinstance(a, str): - a = a.decode(self.encoding, 'replace') - if isinstance(b, str): - b = b.decode(self.encoding, 'replace') - return self._collator.cmp(a, b) diff --git a/collate/syslocale.py b/collate/syslocale.py index e2aeed9..5b8adca 100644 --- a/collate/syslocale.py +++ b/collate/syslocale.py @@ -22,6 +22,8 @@ Avoid this backend if... 
""" +__all__ = ["Collator"] + import locale import re @@ -33,6 +35,7 @@ class Collator(collate._abcollator.Collator): """C library locale-based collation.""" def __init__(self, locale_code, encoding=None): + super(Collator, self).__init__(locale, encoding) locale_code, encoding = collate._locale.getpair(locale_code, encoding) try: setlocale = locale_code + "." + encoding @@ -54,22 +57,9 @@ class Collator(collate._abcollator.Collator): except UnicodeEncodeError: return locale.strxfrm(string.encode(self.encoding, "replace")) - def cmp(self, a, b): - """Return negative if a < b, zero if a == b, positive if a > b. - - If strs rather than unicodes are passed in, they are first - decoded according to the 'encoding' attribute of the Collator. - """ - if isinstance(a, str): - a = a.decode(self.encoding, "replace") - if isinstance(b, str): - b = b.decode(self.encoding, "replace") - return locale.strcoll(a, b) - def words(self, string, sep=re.compile(r"\W+", re.UNICODE)): - """Split the string into separate words. - - This split is done using the locale's notion of a word boundry. - """ + """Split the string into separate words.""" + if isinstance(string, str): + string = string.decode(self.encoding, 'replace') return re.split(sep, string) diff --git a/pycollate b/pycollate index e38e91d..4748b4b 100755 --- a/pycollate +++ b/pycollate @@ -33,7 +33,7 @@ def main(argv): if options.backend or options.locale or options.encoding: backend = getattr(collate, options.backend) - collate.set_default(backend, options.locale, options.encoding) + collate.set_collator(backend, options.locale, options.encoding) if options.diagnostics: print >>sys.stderr, "Backend: " + collate.default.__name__ @@ -54,7 +54,7 @@ def main(argv): line = line.strip() line = line.decode(encoding, "replace") lines.append(line) - lines.sort(key=collate.collator.sortemekey) + lines.sort(key=collate.key) for line in lines: print line.encode(encoding, "replace") -- 2.20.1