ICU backend; uses Pyrex, based on zope.ucol.
author Joe Wreschnig <joe.wreschnig@gmail.com>
Thu, 11 Feb 2010 04:34:41 +0000 (20:34 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Thu, 11 Feb 2010 04:34:41 +0000 (20:34 -0800)
README.txt
ZPL.txt [new file with mode: 0644]
collate/__init__.py
collate/icu/__init__.py [new file with mode: 0644]
collate/icu/_ucol.pyx [new file with mode: 0644]
setup.py [new file with mode: 0755]

diff --git a/README.txt b/README.txt
index 299bc82a05dc9902e5a13a106fba3f733aa2f5d2..629ce6045ac8b08d54a85477b8552464841e4acb 100644 (file)
--- a/README.txt
+++ b/README.txt
@@ -88,6 +88,17 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 
+icu/_ucol.pyx:
+
+Copyright (c) 2004 Zope Corporation and Contributors.
+All Rights Reserved.
+
+This software is subject to the provisions of the Zope Public License,
+Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+FOR A PARTICULAR PURPOSE.
 
 All else:
 
diff --git a/ZPL.txt b/ZPL.txt
new file mode 100644 (file)
index 0000000..3d2c474
--- /dev/null
+++ b/ZPL.txt
@@ -0,0 +1,53 @@
+Zope Public License (ZPL) Version 2.1
+
+A copyright notice accompanies this license document that
+identifies the copyright holders.
+
+This license has been certified as open source. It has also
+been designated as GPL compatible by the Free Software
+Foundation (FSF).
+
+Redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the
+following conditions are met:
+
+1. Redistributions in source code must retain the
+   accompanying copyright notice, this list of conditions,
+   and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the accompanying
+   copyright notice, this list of conditions, and the
+   following disclaimer in the documentation and/or other
+   materials provided with the distribution.
+
+3. Names of the copyright holders must not be used to
+   endorse or promote products derived from this software
+   without prior written permission from the copyright
+   holders.
+
+4. The right to distribute this software or to use it for
+   any purpose does not give you the right to use
+   Servicemarks (sm) or Trademarks (tm) of the copyright
+   holders. Use of them is covered by separate agreement
+   with the copyright holders.
+
+5. If any files are modified, you must cause the modified
+   files to carry prominent notices stating that you changed
+   the files and the date of any change.
+
+Disclaimer
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS''
+  AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT
+  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+  AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
+  NO EVENT SHALL THE COPYRIGHT HOLDERS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+  DAMAGE.
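diff --git a/collate/__init__.py b/collate/__init__.py
index 6d1806a0f0d11cd69a9ee3e4e5c5063eda122463..1d48178845a7b886e908087901dce2b1e0640034 100644 (file)
--- a/collate/__init__.py
+++ b/collate/__init__.py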
@@ -11,30 +11,34 @@ except ImportError:
 
 collator = None
 
-def set_locale(locale_code, strict=False):
+def set_locale(locale_code):
     global collator
 
     if collator is None or collator.locale != locale_code:
-        try:
-            collator = default.Collator(locale_code, strict)
-        except collate.errors.InvalidLocaleError:
-            if strict:
-                raise
+        for code in [locale_code,
+                     locale_code.split("_")[0],
+                     locale.getdefaultlocale()[0],
+                     locale.getdefaultlocale()[0].split("_")[0],
+                     None]:
+            try:
+                collator = default.Collator(code)
+            except collate.errors.InvalidLocaleError:
+                pass
             else:
-                default_locale = locale.getdefaultlocale()[0]
-                try:
-                    collator = default.Collator(default_locale, strict)
-                except collate.errors.InvalidLocaleError:
-                    if not collator:
-                        raise
+                break
+        else:
+            raise collate.errors.InvalidLocaleError(locale_code)
 
 def get_locale():
     return collator.locale
 
-def set_backend(backend, strict=False):
+def set_backend(backend):
     pass
 
 def key(string):
     return collator.key(string)
 
+def cmp(string1, string2):
+    return collator.cmp(string1, string2)
+
 set_locale(locale.getdefaultlocale()[0])
diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py
new file mode 100644 (file)
index 0000000..16ef1e6
--- /dev/null
+++ b/collate/icu/__init__.py
@@ -0,0 +1,12 @@
+import collate.icu._ucol
+import collate._abcollator
+
+class Collator(collate._abcollator.Collator):
+    def __init__(self, locale):
+       self._collator = collate.icu._ucol.Collator(locale)
+
+    def key(self, string):
+       return self._collator.key(string)
+
+    def cmp(self, string1, string2):
+       return self._collator.cmp(string1, string2)
diff --git a/collate/icu/_ucol.pyx b/collate/icu/_ucol.pyx
new file mode 100644 (file)
index 0000000..e54dd53
--- /dev/null
+++ b/collate/icu/_ucol.pyx
@@ -0,0 +1,199 @@
+##############################################################################
+#
+# Copyright (c) 2004 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Simple wrapper for ICU ucol API
+
+"""
+
+import sys
+
+cdef extern from  "unicode/utypes.h":
+
+    cdef enum UErrorCode:
+        U_USING_DEFAULT_WARNING = -127
+        U_USING_FALLBACK_WARNING = -128
+        U_ZERO_ERROR = 0
+        U_ILLEGAL_ARGUMENT_ERROR = 1
+    ctypedef int int32_t
+    ctypedef char uint8_t
+    int U_FAILURE(UErrorCode status)
+
+cdef extern from  "unicode/utf.h":
+
+    ctypedef int UChar
+    ctypedef int UChar32
+
+cdef extern from  "unicode/ustring.h":
+    
+    UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity,
+                          int32_t *pDestLength,
+                          UChar32 *src, int32_t srcLength,
+                          UErrorCode *status)
+
+cdef extern from  "unicode/ucol.h":
+
+    ctypedef struct UCollator:
+        pass
+    UCollator *ucol_open(char *locale, UErrorCode *status)
+    void ucol_close(UCollator *collator)
+    int32_t ucol_getSortKey(UCollator *coll,
+                            UChar *source, int32_t sourceLength,
+                            uint8_t *result,
+                            int32_t resultLength
+                            )
+    int ucol_strcoll(UCollator *coll,
+                     UChar *source, int32_t sourceLength,
+                     UChar *target, int32_t targetLength)
+
+cdef extern from  "Python.h":
+
+    int PyUnicode_Check(ob)
+    int PyString_Check(ob)
+
+    ctypedef int Py_UNICODE
+    Py_UNICODE *PyUnicode_AS_UNICODE(ob)
+    int PyUnicode_GET_SIZE(ob)
+    char *PyString_AS_STRING(ob)
+
+    void *PyMem_Malloc(int size)
+    void PyMem_Free(void *p)
+    object PyString_FromStringAndSize(char *v, int l)
+    
+    
+cdef class UCharString:
+    """Wrapper for ICU UChar arrays
+    """
+
+    cdef UChar *data
+    cdef readonly int32_t length
+    cdef readonly object base
+    cdef readonly int need_to_free
+
+    def __cinit__(self, text):
+        cdef int32_t buffsize
+        cdef UErrorCode status
+        cdef Py_UNICODE *str
+        cdef int length
+
+        if not PyUnicode_Check(text):
+            if PyString_Check(text):
+                text = unicode(text)
+                assert PyUnicode_Check(text)
+            else:
+                raise TypeError("Expected unicode string")
+
+        length = PyUnicode_GET_SIZE(text)
+        str = PyUnicode_AS_UNICODE(text)
+        
+
+        if sizeof(Py_UNICODE) == 2:
+            self.data = str
+            self.length = length
+            self.base = text
+            self.need_to_free = 0
+        else:
+            buffsize = 2*length + 1
+            self.data = <UChar*>PyMem_Malloc(buffsize*sizeof(UChar))
+            if self.data == NULL:
+                raise MemoryError
+            status = U_ZERO_ERROR
+            u_strFromUTF32(self.data, buffsize, &(self.length),
+                           <UChar32*>str, length, &status)
+            assert self.length <= buffsize
+            self.need_to_free = 1
+            if U_FAILURE(status):
+                raise ValueError(
+                    "Couldn't convert Python unicode data to ICU unicode data."
+                    )
+
+    def __dealloc__(self):
+        if self.need_to_free and self.data != NULL:
+            PyMem_Free(self.data)
+            self.data = NULL
+
+
+cdef class Collator:
+    """Compute a collation key for a unicode string.
+    """
+
+    cdef UCollator *collator
+    cdef readonly object locale
+    cdef readonly int used_default_information
+
+    def __cinit__(self, locale):
+        cdef UCollator *collator
+        cdef UErrorCode status
+
+        if not PyString_Check(locale):
+            raise TypeError("String locale expected")
+        
+        status = U_ZERO_ERROR
+        collator = ucol_open(PyString_AS_STRING(locale), &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't create a collator")
+        self.collator = collator
+        self.locale = locale
+        if (status == U_USING_DEFAULT_WARNING
+            or status == U_USING_FALLBACK_WARNING):
+            status = U_ILLEGAL_ARGUMENT_ERROR
+        self.used_default_information = status
+
+    def __dealloc__(self):
+        if self.collator != NULL:
+            ucol_close(self.collator)
+
+    def key(self, text):
+        """Compute a collation key for the given unicode text.
+
+        Of course, the key is only valid for the given locale.
+        """
+        cdef char *buffer
+        cdef int32_t bufsize
+        cdef int32_t size
+
+        icutext = UCharString(text)
+        bufsize = (<UCharString>icutext).length*2+10
+
+        # the +1 below is needed to avoid an apparent buffer overflow bug in ICU
+        buffer = <char*>PyMem_Malloc(bufsize +1)
+        if buffer == NULL:
+            raise MemoryError
+        size = ucol_getSortKey(self.collator,
+                               (<UCharString>icutext).data,
+                               (<UCharString>icutext).length,
+                               <uint8_t*>buffer, bufsize)
+        while size > bufsize:
+            bufsize = size
+            PyMem_Free(buffer)
+            buffer = <char*>PyMem_Malloc(bufsize +1) # See above +1
+            if buffer == NULL:
+                raise MemoryError
+            size = ucol_getSortKey(self.collator,
+                                   (<UCharString>icutext).data,
+                                   (<UCharString>icutext).length,
+                                   <uint8_t*>buffer, bufsize)
+
+        result = PyString_FromStringAndSize(buffer, size)
+        PyMem_Free(buffer)
+        return result
+
+    def cmp(self, o1, o2):
+        u1 = UCharString(o1)
+        u2 = UCharString(o2)
+        return ucol_strcoll(
+            self.collator,
+            (<UCharString>u1).data,
+            (<UCharString>u1).length,
+            (<UCharString>u2).data,
+            (<UCharString>u2).length,
+            )
diff --git a/setup.py b/setup.py
new file mode 100755 (executable)
index 0000000..cf14037
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+import sys
+
+from distutils.core import setup, Extension
+from Pyrex.Distutils import build_ext
+
+if sys.platform.startswith('win'):
+    libraries = ['icuin', 'icuuc', 'icudt']
+else:
+    libraries = ['icui18n', 'icuuc', 'icudata']
+
+setup(name='collate',
+      version='0',
+      author="Joe Wreschnig",
+      author_email="joe.wreschnig@gmail.com",
+      description="Python text collation",
+      license="MIT / ZPL 2.1",
+      ext_modules=[
+          Extension('collate.icu._ucol',
+                    ['collate/icu/_ucol.pyx'],
+                    libraries=libraries)],
+      cmdclass=dict(build_ext=build_ext),
+      packages=["collate", "collate.icu", "collate.uca", "collate.syslocale"],
+      )