ICU backend; uses Pyrex, based on zope.ucol.
author Joe Wreschnig <joe.wreschnig@gmail.com>
Thu, 11 Feb 2010 04:34:41 +0000 (20:34 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Thu, 11 Feb 2010 04:34:41 +0000 (20:34 -0800)
README.txt
ZPL.txt [new file with mode: 0644]
collate/__init__.py
collate/icu/__init__.py [new file with mode: 0644]
collate/icu/_ucol.pyx [new file with mode: 0644]
setup.py [new file with mode: 0755]

diff --git a/README.txt b/README.txt
index 299bc82..629ce60 100644 (file)
--- a/README.txt
+++ b/README.txt
@@ -88,6 +88,17 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 
+icu/_ucol.pyx:
+
+Copyright (c) 2004 Zope Corporation and Contributors.
+All Rights Reserved.
+
+This software is subject to the provisions of the Zope Public License,
+Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+FOR A PARTICULAR PURPOSE.
 
 All else:
 
 
 All else:
 
diff --git a/ZPL.txt b/ZPL.txt
new file mode 100644 (file)
index 0000000..3d2c474
--- /dev/null
+++ b/ZPL.txt
@@ -0,0 +1,53 @@
+Zope Public License (ZPL) Version 2.1
+
+A copyright notice accompanies this license document that
+identifies the copyright holders.
+
+This license has been certified as open source. It has also
+been designated as GPL compatible by the Free Software
+Foundation (FSF).
+
+Redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the
+following conditions are met:
+
+1. Redistributions in source code must retain the
+   accompanying copyright notice, this list of conditions,
+   and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the accompanying
+   copyright notice, this list of conditions, and the
+   following disclaimer in the documentation and/or other
+   materials provided with the distribution.
+
+3. Names of the copyright holders must not be used to
+   endorse or promote products derived from this software
+   without prior written permission from the copyright
+   holders.
+
+4. The right to distribute this software or to use it for
+   any purpose does not give you the right to use
+   Servicemarks (sm) or Trademarks (tm) of the copyright
+   holders. Use of them is covered by separate agreement
+   with the copyright holders.
+
+5. If any files are modified, you must cause the modified
+   files to carry prominent notices stating that you changed
+   the files and the date of any change.
+
+Disclaimer
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS''
+  AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT
+  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+  AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
+  NO EVENT SHALL THE COPYRIGHT HOLDERS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+  DAMAGE.
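diff --git a/collate/__init__.py b/collate/__init__.py
index 6d1806a..1d48178 100644 (file)
--- a/collate/__init__.py
+++ b/collate/__init__.py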
@@ -11,30 +11,34 @@ except ImportError:
 
 collator = None
 
 
 collator = None
 
-def set_locale(locale_code, strict=False):
+def set_locale(locale_code):
     global collator
 
     if collator is None or collator.locale != locale_code:
     global collator
 
     if collator is None or collator.locale != locale_code:
-        try:
-            collator = default.Collator(locale_code, strict)
-        except collate.errors.InvalidLocaleError:
-            if strict:
-                raise
+        for code in [locale_code,
+                     locale_code.split("_")[0],
+                     locale.getdefaultlocale()[0],
+                     locale.getdefaultlocale()[0].split("_")[0],
+                     None]:
+            try:
+                collator = default.Collator(code)
+            except collate.errors.InvalidLocaleError:
+                pass
             else:
             else:
-                default_locale = locale.getdefaultlocale()[0]
-                try:
-                    collator = default.Collator(default_locale, strict)
-                except collate.errors.InvalidLocaleError:
-                    if not collator:
-                        raise
+                break
+        else:
+            raise collate.errors.InvalidLocaleError(locale_code)
 
 def get_locale():
     return collator.locale
 
 
 def get_locale():
     return collator.locale
 
-def set_backend(backend, strict=False):
+def set_backend(backend):
     pass
 
 def key(string):
     return collator.key(string)
 
     pass
 
 def key(string):
     return collator.key(string)
 
+def cmp(string1, string2):
+    return collator.cmp(string1, string2)
+
 set_locale(locale.getdefaultlocale()[0])
 set_locale(locale.getdefaultlocale()[0])
diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py
new file mode 100644 (file)
index 0000000..16ef1e6
--- /dev/null
+++ b/collate/icu/__init__.py
@@ -0,0 +1,12 @@
+import collate.icu._ucol
+import collate._abcollator
+
+class Collator(collate._abcollator.Collator):
+    """Collator that delegates all work to collate.icu._ucol.Collator."""
+
+    def __init__(self, locale):
+       """Create the underlying ICU collator for `locale`."""
+       self._collator = collate.icu._ucol.Collator(locale)
+
+    def key(self, string):
+       """Return the wrapped collator's sort key for `string`."""
+       return self._collator.key(string)
+
+    def cmp(self, string1, string2):
+       """Return the wrapped collator's comparison of the two strings."""
+       return self._collator.cmp(string1, string2)
diff --git a/collate/icu/_ucol.pyx b/collate/icu/_ucol.pyx
new file mode 100644 (file)
index 0000000..e54dd53
--- /dev/null
+++ b/collate/icu/_ucol.pyx
@@ -0,0 +1,199 @@
+##############################################################################
+#
+# Copyright (c) 2004 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Simple wrapper for ICU ucol API
+
+"""
+
+import sys
+
+cdef extern from  "unicode/utypes.h":
+
+    cdef enum UErrorCode:
+        U_USING_DEFAULT_WARNING = -127
+        U_USING_FALLBACK_WARNING = -128
+        U_ZERO_ERROR = 0
+        U_ILLEGAL_ARGUMENT_ERROR = 1
+    ctypedef int int32_t
+    ctypedef char uint8_t
+    int U_FAILURE(UErrorCode status)
+
+cdef extern from  "unicode/utf.h":
+
+    ctypedef int UChar
+    ctypedef int UChar32
+
+cdef extern from  "unicode/ustring.h":
+    
+    UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity,
+                          int32_t *pDestLength,
+                          UChar32 *src, int32_t srcLength,
+                          UErrorCode *status)
+
+cdef extern from  "unicode/ucol.h":
+
+    ctypedef struct UCollator:
+        pass
+    UCollator *ucol_open(char *locale, UErrorCode *status)
+    void ucol_close(UCollator *collator)
+    int32_t ucol_getSortKey(UCollator *coll,
+                            UChar *source, int32_t sourceLength,
+                            uint8_t *result,
+                            int32_t resultLength
+                            )
+    int ucol_strcoll(UCollator *coll,
+                     UChar *source, int32_t sourceLength,
+                     UChar *target, int32_t targetLength)
+
+cdef extern from  "Python.h":
+
+    int PyUnicode_Check(ob)
+    int PyString_Check(ob)
+
+    ctypedef int Py_UNICODE
+    Py_UNICODE *PyUnicode_AS_UNICODE(ob)
+    int PyUnicode_GET_SIZE(ob)
+    char *PyString_AS_STRING(ob)
+
+    void *PyMem_Malloc(int size)
+    void PyMem_Free(void *p)
+    object PyString_FromStringAndSize(char *v, int l)
+    
+    
+cdef class UCharString:
+    """Wrapper for ICU UChar arrays
+
+    Exposes a Python string as a UChar buffer (``data``/``length``) that
+    can be handed to ICU.  Byte strings are first converted with
+    ``unicode()``; anything else raises TypeError.
+
+    When Py_UNICODE is two bytes wide the Python buffer is used directly
+    and ``base`` keeps the unicode object alive.  Otherwise the text is
+    converted with u_strFromUTF32 into a buffer owned by this object
+    (``need_to_free`` is set) and released in __dealloc__.
+    """
+
+    cdef UChar *data
+    cdef readonly int32_t length
+    cdef readonly object base
+    cdef readonly int need_to_free
+
+    def __cinit__(self, text):
+        cdef int32_t buffsize
+        cdef UErrorCode status
+        cdef Py_UNICODE *str
+        cdef int length
+
+        if not PyUnicode_Check(text):
+            if PyString_Check(text):
+                text = unicode(text)
+                assert PyUnicode_Check(text)
+            else:
+                raise TypeError("Expected unicode string")
+
+        length = PyUnicode_GET_SIZE(text)
+        str = PyUnicode_AS_UNICODE(text)
+
+        if sizeof(Py_UNICODE) == 2:
+            # Borrow the unicode object's own buffer; holding a reference
+            # in ``base`` keeps that buffer valid for our lifetime.
+            self.data = str
+            self.length = length
+            self.base = text
+            self.need_to_free = 0
+        else:
+            # Each source code point needs at most two UChars.
+            buffsize = 2*length + 1
+            self.data = <UChar*>PyMem_Malloc(buffsize*sizeof(UChar))
+            if self.data == NULL:
+                raise MemoryError
+            # Take ownership immediately so that __dealloc__ releases the
+            # buffer on every error path below, not only on success.
+            self.need_to_free = 1
+            status = U_ZERO_ERROR
+            u_strFromUTF32(self.data, buffsize, &(self.length),
+                           <UChar32*>str, length, &status)
+            # Report a failed conversion before sanity-checking the length,
+            # so callers get the ValueError rather than an AssertionError.
+            if U_FAILURE(status):
+                raise ValueError(
+                    "Couldn't convert Python unicode data to ICU unicode data."
+                    )
+            assert self.length <= buffsize
+
+    def __dealloc__(self):
+        # Only free buffers we allocated; a borrowed buffer belongs to
+        # the unicode object referenced by ``base``.
+        if self.need_to_free and self.data != NULL:
+            PyMem_Free(self.data)
+            self.data = NULL
+
+
+cdef class Collator:
+    """ICU collator for a single locale.
+
+    Wraps a UCollator opened with ucol_open and offers sort keys
+    (``key``) and direct string comparison (``cmp``).  ``locale`` is the
+    locale string the collator was created with and
+    ``used_default_information`` is non-zero when ICU reported that it
+    used default or fallback data while opening the collator.
+    """
+
+    cdef UCollator *collator
+    cdef readonly object locale
+    cdef readonly int used_default_information
+
+    def __cinit__(self, locale):
+        cdef UCollator *collator
+        cdef UErrorCode status
+
+        if not PyString_Check(locale):
+            raise TypeError("String locale expected")
+
+        status = U_ZERO_ERROR
+        collator = ucol_open(PyString_AS_STRING(locale), &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't create a collator")
+        self.collator = collator
+        self.locale = locale
+        # The two warning codes are not failures; collapse them to a
+        # simple non-zero flag for the readonly attribute.
+        if (status == U_USING_DEFAULT_WARNING
+            or status == U_USING_FALLBACK_WARNING):
+            status = U_ILLEGAL_ARGUMENT_ERROR
+        self.used_default_information = status
+
+    def __dealloc__(self):
+        if self.collator != NULL:
+            ucol_close(self.collator)
+
+    def key(self, text):
+        """Compute a collation key for the given unicode text.
+
+        Of course, the key is only valid for the given locale.
+
+        Returns the key as a byte string.  Raises MemoryError if the key
+        buffer cannot be allocated, and whatever UCharString raises for
+        text it cannot convert.
+        """
+        cdef char *buffer
+        cdef int32_t bufsize
+        cdef int32_t size
+
+        icutext = UCharString(text)
+        # Initial guess; enlarged below if ICU reports a bigger key.
+        bufsize = (<UCharString>icutext).length*2+10
+
+        # the +1 below is needed to avoid an apparent buffer overflow bug in ICU
+        buffer = <char*>PyMem_Malloc(bufsize +1)
+        if buffer == NULL:
+            raise MemoryError
+        try:
+            size = ucol_getSortKey(self.collator,
+                                   (<UCharString>icutext).data,
+                                   (<UCharString>icutext).length,
+                                   <uint8_t*>buffer, bufsize)
+            # A returned size larger than the buffer means the key did not
+            # fit: retry with a buffer of exactly the reported size.
+            while size > bufsize:
+                bufsize = size
+                PyMem_Free(buffer)
+                buffer = <char*>PyMem_Malloc(bufsize +1) # See above +1
+                if buffer == NULL:
+                    raise MemoryError
+                size = ucol_getSortKey(self.collator,
+                                       (<UCharString>icutext).data,
+                                       (<UCharString>icutext).length,
+                                       <uint8_t*>buffer, bufsize)
+
+            result = PyString_FromStringAndSize(buffer, size)
+        finally:
+            # Release the buffer even when building the result string
+            # fails; PyMem_Free(NULL) is a no-op, so the failed-realloc
+            # path is safe too.
+            PyMem_Free(buffer)
+        return result
+
+    def cmp(self, o1, o2):
+        """Compare two strings with this collator.
+
+        Returns the value of ucol_strcoll for the two texts (ICU's
+        UCOL_LESS, UCOL_EQUAL or UCOL_GREATER).
+        """
+        u1 = UCharString(o1)
+        u2 = UCharString(o2)
+        return ucol_strcoll(
+            self.collator,
+            (<UCharString>u1).data,
+            (<UCharString>u1).length,
+            (<UCharString>u2).data,
+            (<UCharString>u2).length,
+            )
diff --git a/setup.py b/setup.py
new file mode 100755 (executable)
index 0000000..cf14037
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+# Build script for the collate package and its collate.icu._ucol
+# extension module, which is compiled from Pyrex source.
+
+import sys
+
+from distutils.core import setup, Extension
+from Pyrex.Distutils import build_ext
+
+# Libraries the extension is linked against; the names used on
+# Windows differ from those used elsewhere.
+if sys.platform.startswith('win'):
+    libraries = ['icuin', 'icuuc', 'icudt']
+else:
+    libraries = ['icui18n', 'icuuc', 'icudata']
+
+setup(name='collate',
+      version='0',
+      author="Joe Wreschnig",
+      author_email="joe.wreschnig@gmail.com",
+      description="Python text collation",
+      license="MIT / ZPL 2.1",
+      ext_modules=[
+          Extension('collate.icu._ucol',
+                    ['collate/icu/_ucol.pyx'],
+                    libraries=libraries)],
+      # Use Pyrex's build_ext so the .pyx source is handled at build time.
+      cmdclass=dict(build_ext=build_ext),
+      packages=["collate", "collate.icu", "collate.uca", "collate.syslocale"],
+      )