From: Joe Wreschnig Date: Fri, 12 Feb 2010 10:06:56 +0000 (-0800) Subject: Raw codepoint test backend. Tweaks to constructor arguments. Try to handle the case... X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=commitdiff_plain;h=f7fd328bfc2886f6aed2c09b84cc1e039c7c3240;ds=sidebyside Raw codepoint test backend. Tweaks to constructor arguments. Try to handle the case when no locale or string codec information is available.. --- diff --git a/collate/__init__.py b/collate/__init__.py index 6ce1114..0f50170 100644 --- a/collate/__init__.py +++ b/collate/__init__.py @@ -1,67 +1,47 @@ -import locale - -if locale.getlocale()[0] is None: - locale.setlocale(locale.LC_COLLATE, '') - import collate.errors +import collate._locale +try: + import collate.codepoint as default +except ImportError: + pass try: import collate.syslocale as default +except ImportError: + pass +try: import collate.uca as default +except ImportError: + pass +try: import collate.icu as default except ImportError: pass collator = None -preferred_locale = None - -def _get_collator(backend, locale_code): - for code in [locale_code, - locale_code.split("_")[0], - locale.getlocale(locale.LC_COLLATE)[0], - locale.getlocale(locale.LC_COLLATE)[0].split("_")[0], - locale.getdefaultlocale()[0], - locale.getdefaultlocale()[0].split("_")[0], - None]: - try: - return default.Collator(code) - except collate.errors.InvalidLocaleError: - pass -def set_locale(locale_code): - global collator - global preferred_locale - - preferred_locale = locale_code - if collator is None or collator.locale != locale_code: - c = _get_collator(default, locale_code) - if c is not None: - collator = c - else: - raise collate.errors.InvalidLocaleError(locale_code) - -def set_backend(backend): +def set_default(backend=None, locale=None, encoding=None): global collator global default - c = _get_collator(backend, preferred_locale) + if backend is None: + backend = default + locales = collate._locale.localelist(locale) + c = None + for locale in locales: + locale, encoding_ = collate._locale.getpair(locale, encoding) + try: + c = backend.Collator(locale, encoding_) + except collate.errors.InvalidLocaleError: + pass if c is not None: - collator = c - default = backend - else: - raise collate.errors.InvalidLocaleError(locale_code) + collator = c + default = backend def key(string): return collator.key(string) -def cmp(string1, string2): - return collator.cmp(string1, string2) +def cmp(a, b): + return collator.cmp(a, b) -try: - set_locale(locale.getlocale()[0]) -except collate.errors.InvalidLocaleError: - # There's no way this should fail unless the C locale system is - # fucked or missing all data. - import collator.syslocale - set_backend(collator.syslocale) - set_locale(locale.getlocale(locale.LC_COLLATE)[0]) +set_default() diff --git a/collate/_locale.py b/collate/_locale.py new file mode 100644 index 0000000..bebbc95 --- /dev/null +++ b/collate/_locale.py @@ -0,0 +1,123 @@ +"""Locale utility routines.""" + +import sys + +try: + import locale +except ImportError: + locale = None + +try: + import codecs +except ImportError: + codecs = None + +__all__ = ["localelist"] + +def localelist(*locales): + """Normalize and return a list of locales, with appended defaults. + + e.g. on a system with en_US as the default locale, + + localelist('en_GB.utf8', 'de_DE') => + ['en_GB', 'en', 'de_DE', 'de', 'en_US', 'C'] + + """ + + locales = list(locales) + + if locale is not None: + # Set the locale if it hasn't already been set, but don't fail + # if we can't set it for some reason. + if locale.getlocale(locale.LC_COLLATE)[0] is None: + try: + locale.setlocale(locale.LC_COLLATE, '') + except locale.Error: + pass + + # Throw in the user's specified collation locale, the current locale, + # the default locale, and POSIX, for free. + locales.append(locale.getlocale(locale.LC_COLLATE)[0]) + locales.append(locale.getlocale()[0]) + locales.append(locale.getdefaultlocale()[0]) + locales.append("C") + + # Don't put the same locale in the return list more than twice. + added = set() + retlist = [] + + for code in locales: + if not code: + continue + if locale is not None: + code = locale.normalize(code) + # Strip off encoding if present. + code = code.split(".")[0] + if code.lower() not in added: + retlist.append(code) + added.add(code.lower()) + # Strip off territory if present. + code = code.split("_")[0] + if code.lower() not in added: + retlist.append(code) + added.add(code.lower()) + + return retlist + +def encoding(preferred=None): + """Try to find an optimal encoding. + + Arguments: + preferred - use this encoding if possible + + Otherwise, the locale encoding or the Python system encoding are + used. + """ + # can't use any codecs, use the system one (ascii). + if codecs is None: + return sys.getdefaultencoding() + + # if preferred is a valid codec, use it. + if preferred is not None: + try: + return codecs.lookup(preferred).name + except (LookupError, AttributeError): + pass + + # preferred is bad and can't get it from locale. + if locale is None: + return sys.getdefaultencoding() + + # try to get it from the locale, if not there, set it and try again. + fromlocale = locale.getlocale(locale.LC_COLLATE)[1] + if fromlocale is not None: + return fromlocale + try: + locale.setlocale(locale.LC_COLLATE, '')[1] + except locale.Error: + pass + else: + fromlocale = locale.getlocale(locale.LC_COLLATE) + if fromlocale is not None: + return fromlocale + + # okay, LC_COLLATE isn't set, maybe the generic locale is. + fromlocale = locale.getlocale()[1] + if fromlocale is not None: + return fromlocale + + # but we won't reset the generic locale if it isn't, that'd be + # rude. + + # if the locale can't even give us a simple encoding, go back + # to the system one, and give up. + return locale.getpreferredencoding() or sys.getdefaultencoding() + +def getpair(locale_, encoding_): + if "." in locale_: + if encoding_ is None: + locale_, encoding_ = locale_.rsplit(".", 1) + else: + locale_ = locale_.rsplit(".")[0] + return locale_, encoding(encoding_) + diff --git a/collate/codepoint.py b/collate/codepoint.py new file mode 100644 index 0000000..e0bbcbd --- /dev/null +++ b/collate/codepoint.py @@ -0,0 +1,43 @@ +"""Codepoint-based collation. + +This collation backend sorts using only the basic codepoint order. It +is primarily intended to be used as a baseline and example for other +collation backends. + +Use this collation backend if... + - You are writing tests for pycollate. + - You are writing specialized Unicode software. + - You are on a system with no locale module. + +Avoid this backend if... + - You are writing a normal program for a normal runtime environment. + - You are sorting strings to show normal humans. + +""" + +import collate._abcollator +import collate._locale + +__all__ = ["Collate"] + +class Collator(collate._abcollator.Collator): + """Codepoint-based collation. + + Arguments + locale - all parts but encoding ignored, always 'C' + encoding - try to use this string encoding + """ + + def __init__(self, locale=None, encoding=None): + dummy, self.encoding = collate._locale.getpair(locale, encoding) + self.locale = "C" + + def key(self, string): + """Sort key for a string. + + If string is a str instance, it is first decoded according to + the 'encoding' attribute of the Collator. + """ + if isinstance(string, str): + string = string.decode(self.encoding, 'replace') + return string diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py index 1e2ba50..bb0dcd8 100644 --- a/collate/icu/__init__.py +++ b/collate/icu/__init__.py @@ -4,7 +4,7 @@ import collate._abcollator NAME = "ICU" class Collator(collate._abcollator.Collator): - def __init__(self, locale): + def __init__(self, locale, encoding): self._collator = collate.icu._ucol.Collator(locale) self.locale = self._collator.locale diff --git a/collate/syslocale/__init__.py b/collate/syslocale/__init__.py index 43281e2..a76d538 100644 --- a/collate/syslocale/__init__.py +++ b/collate/syslocale/__init__.py @@ -4,7 +4,7 @@ import collate.errors import collate._abcollator class Collator(collate._abcollator.Collator): - def __init__(self, locale_code): + def __init__(self, locale_code, encoding=None): default = locale.getdefaultlocale()[0] for locale in [locale_code, default]: try: diff --git a/collate/uca/__init__.py b/collate/uca/__init__.py index 228a24c..7ddf731 100644 --- a/collate/uca/__init__.py +++ b/collate/uca/__init__.py @@ -30,7 +30,7 @@ class Trie(object): class Collator(collate._abcollator.Collator): - def __init__(self, locale_code, strict=False): + def __init__(self, locale_code, encoding=None): self.__table = Trie() self.locale = locale_code