From c02c5c3c54d35e7d5836adf54aadac1f79906f05 Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Wed, 10 Feb 2010 20:34:41 -0800 Subject: [PATCH 1/1] ICU backend; uses Pyrex, based on zope.ucol. --- README.txt | 11 +++ ZPL.txt | 53 +++++++++++ collate/__init__.py | 30 +++--- collate/icu/__init__.py | 12 +++ collate/icu/_ucol.pyx | 199 ++++++++++++++++++++++++++++++++++++++++ setup.py | 25 +++++ 6 files changed, 317 insertions(+), 13 deletions(-) create mode 100644 ZPL.txt create mode 100644 collate/icu/__init__.py create mode 100644 collate/icu/_ucol.pyx create mode 100755 setup.py diff --git a/README.txt b/README.txt index 299bc82..629ce60 100644 --- a/README.txt +++ b/README.txt @@ -88,6 +88,17 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +icu/_ucol.pyx: + +Copyright (c) 2004 Zope Corporation and Contributors. +All Rights Reserved. + +This software is subject to the provisions of the Zope Public License, +Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. +THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED +WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS +FOR A PARTICULAR PURPOSE. All else: diff --git a/ZPL.txt b/ZPL.txt new file mode 100644 index 0000000..3d2c474 --- /dev/null +++ b/ZPL.txt @@ -0,0 +1,53 @@ +Zope Public License (ZPL) Version 2.1 + +A copyright notice accompanies this license document that +identifies the copyright holders. + +This license has been certified as open source. It has also +been designated as GPL compatible by the Free Software +Foundation (FSF). + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. 
Redistributions in source code must retain the + accompanying copyright notice, this list of conditions, + and the following disclaimer. + +2. Redistributions in binary form must reproduce the accompanying + copyright notice, this list of conditions, and the + following disclaimer in the documentation and/or other + materials provided with the distribution. + +3. Names of the copyright holders must not be used to + endorse or promote products derived from this software + without prior written permission from the copyright + holders. + +4. The right to distribute this software or to use it for + any purpose does not give you the right to use + Servicemarks (sm) or Trademarks (tm) of the copyright + holders. Use of them is covered by separate agreement + with the copyright holders. + +5. If any files are modified, you must cause the modified + files to carry prominent notices stating that you changed + the files and the date of any change. + +Disclaimer + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' + AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT + NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN + NO EVENT SHALL THE COPYRIGHT HOLDERS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + DAMAGE. 
diff --git a/collate/__init__.py b/collate/__init__.py
index 6d1806a..1d48178 100644
--- a/collate/__init__.py
+++ b/collate/__init__.py
@@ -11,30 +11,42 @@ except ImportError:
 
 collator = None
 
-def set_locale(locale_code, strict=False):
+def set_locale(locale_code):
+    """Set the module-level collator for locale_code.
+
+    Tries locale_code, its language part, the system default locale and
+    its language part, then None; raises InvalidLocaleError if all fail.
+    """
     global collator
     if collator is None or collator.locale != locale_code:
-        try:
-            collator = default.Collator(locale_code, strict)
-        except collate.errors.InvalidLocaleError:
-            if strict:
-                raise
+        # Either code may be None (locale.getdefaultlocale() returns
+        # (None, None) when it cannot tell), so only split real strings.
+        candidates = []
+        for code in [locale_code, locale.getdefaultlocale()[0]]:
+            if code is not None:
+                candidates.extend([code, code.split("_")[0]])
+        candidates.append(None)
+        for code in candidates:
+            try:
+                collator = default.Collator(code)
+            except collate.errors.InvalidLocaleError:
+                pass
             else:
-                default_locale = locale.getdefaultlocale()[0]
-                try:
-                    collator = default.Collator(default_locale, strict)
-                except collate.errors.InvalidLocaleError:
-                    if not collator:
-                        raise
+                break
+        else:
+            raise collate.errors.InvalidLocaleError(locale_code)
 
 def get_locale():
     return collator.locale
 
-def set_backend(backend, strict=False):
+def set_backend(backend):
     pass
 
 def key(string):
     return collator.key(string)
 
+def cmp(string1, string2):
+    return collator.cmp(string1, string2)
+
 set_locale(locale.getdefaultlocale()[0])
diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py
new file mode 100644
index 0000000..16ef1e6
--- /dev/null
+++ b/collate/icu/__init__.py
@@ -0,0 +1,12 @@
+import collate.icu._ucol
+import collate._abcollator
+
+class Collator(collate._abcollator.Collator):
+    def __init__(self, locale):
+        self._collator = collate.icu._ucol.Collator(locale)
+
+    def key(self, string):
+        return self._collator.key(string)
+
+    def cmp(self, string1, string2):
+        return self._collator.cmp(string1, string2)
diff --git a/collate/icu/_ucol.pyx b/collate/icu/_ucol.pyx
new file mode 100644
index 0000000..e54dd53
--- /dev/null
+++ b/collate/icu/_ucol.pyx
@@ -0,0 +1,199 @@
+##############################################################################
+#
+# Copyright (c) 2004 Zope
Corporation and Contributors. +# All Rights Reserved. +# +# This software is subject to the provisions of the Zope Public License, +# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. +# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED +# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS +# FOR A PARTICULAR PURPOSE. +# +############################################################################## +"""Simple wrapper for ICU ucol API + +""" + +import sys + +cdef extern from "unicode/utypes.h": + + cdef enum UErrorCode: + U_USING_DEFAULT_WARNING = -127 + U_USING_FALLBACK_WARNING = -128 + U_ZERO_ERROR = 0 + U_ILLEGAL_ARGUMENT_ERROR = 1 + ctypedef int int32_t + ctypedef char uint8_t + int U_FAILURE(UErrorCode status) + +cdef extern from "unicode/utf.h": + + ctypedef int UChar + ctypedef int UChar32 + +cdef extern from "unicode/ustring.h": + + UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity, + int32_t *pDestLength, + UChar32 *src, int32_t srcLength, + UErrorCode *status) + +cdef extern from "unicode/ucol.h": + + ctypedef struct UCollator: + pass + UCollator *ucol_open(char *locale, UErrorCode *status) + void ucol_close(UCollator *collator) + int32_t ucol_getSortKey(UCollator *coll, + UChar *source, int32_t sourceLength, + uint8_t *result, + int32_t resultLength + ) + int ucol_strcoll(UCollator *coll, + UChar *source, int32_t sourceLength, + UChar *target, int32_t targetLength) + +cdef extern from "Python.h": + + int PyUnicode_Check(ob) + int PyString_Check(ob) + + ctypedef int Py_UNICODE + Py_UNICODE *PyUnicode_AS_UNICODE(ob) + int PyUnicode_GET_SIZE(ob) + char *PyString_AS_STRING(ob) + + void *PyMem_Malloc(int size) + void PyMem_Free(void *p) + object PyString_FromStringAndSize(char *v, int l) + + +cdef class UCharString: + """Wrapper for ICU UChar arrays + """ + + cdef UChar *data + cdef readonly int32_t length + cdef 
readonly object base + cdef readonly int need_to_free + + def __cinit__(self, text): + cdef int32_t buffsize + cdef UErrorCode status + cdef Py_UNICODE *str + cdef int length + + if not PyUnicode_Check(text): + if PyString_Check(text): + text = unicode(text) + assert PyUnicode_Check(text) + else: + raise TypeError("Expected unicode string") + + length = PyUnicode_GET_SIZE(text) + str = PyUnicode_AS_UNICODE(text) + + + if sizeof(Py_UNICODE) == 2: + self.data = str + self.length = length + self.base = text + self.need_to_free = 0 + else: + buffsize = 2*length + 1 + self.data = PyMem_Malloc(buffsize*sizeof(UChar)) + if self.data == NULL: + raise MemoryError + status = U_ZERO_ERROR + u_strFromUTF32(self.data, buffsize, &(self.length), + str, length, &status) + assert self.length <= buffsize + self.need_to_free = 1 + if U_FAILURE(status): + raise ValueError( + "Couldn't convert Python unicode data to ICU unicode data." + ) + + def __dealloc__(self): + if self.need_to_free and self.data != NULL: + PyMem_Free(self.data) + self.data = NULL + + +cdef class Collator: + """Compute a collation key for a unicode string. + """ + + cdef UCollator *collator + cdef readonly object locale + cdef readonly int used_default_information + + def __cinit__(self, locale): + cdef UCollator *collator + cdef UErrorCode status + + if not PyString_Check(locale): + raise TypeError("String locale expected") + + status = U_ZERO_ERROR + collator = ucol_open(PyString_AS_STRING(locale), &status) + if U_FAILURE(status): + raise ValueError("Couldn't create a collator") + self.collator = collator + self.locale = locale + if (status == U_USING_DEFAULT_WARNING + or status == U_USING_FALLBACK_WARNING): + status = U_ILLEGAL_ARGUMENT_ERROR + self.used_default_information = status + + def __dealloc__(self): + if self.collator != NULL: + ucol_close(self.collator) + + def key(self, text): + """Compute a collation key for the given unicode text. + + Of course, the key is only valid for the given locale. 
+ """ + cdef char *buffer + cdef int32_t bufsize + cdef int32_t size + + icutext = UCharString(text) + bufsize = (icutext).length*2+10 + + # the +1 below is needed to avoid an apparent buffer overflow bug in ICU + buffer = PyMem_Malloc(bufsize +1) + if buffer == NULL: + raise MemoryError + size = ucol_getSortKey(self.collator, + (icutext).data, + (icutext).length, + buffer, bufsize) + while size > bufsize: + bufsize = size + PyMem_Free(buffer) + buffer = PyMem_Malloc(bufsize +1) # See above +1 + if buffer == NULL: + raise MemoryError + size = ucol_getSortKey(self.collator, + (icutext).data, + (icutext).length, + buffer, bufsize) + + result = PyString_FromStringAndSize(buffer, size) + PyMem_Free(buffer) + return result + + def cmp(self, o1, o2): + u1 = UCharString(o1) + u2 = UCharString(o2) + return ucol_strcoll( + self.collator, + (u1).data, + (u1).length, + (u2).data, + (u2).length, + ) diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..cf14037 --- /dev/null +++ b/setup.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python + +import sys + +from distutils.core import setup, Extension +from Pyrex.Distutils import build_ext + +if sys.platform.startswith('win'): + libraries = ['icuin', 'icuuc', 'icudt'] +else: + libraries = ['icui18n', 'icuuc', 'icudata'] + +setup(name='collate', + version='0', + author="Joe Wreschnig", + author_email="joe.wreschnig@gmail.com", + description="Python text collation", + license="MIT / ZPL 2.1", + ext_modules=[ + Extension('collate.icu._ucol', + ['collate/icu/_ucol.pyx'], + libraries=libraries)], + cmdclass=dict(build_ext=build_ext), + packages=["collate", "collate.icu", "collate.uca", "collate.syslocale"], + ) -- 2.20.1