# X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2Ficu%2F_ucol.pyx;fp=collate%2Ficu%2F_ucol.pyx;h=e54dd5319604a1b0cdf0fde89aadcb0a26951d73;hp=0000000000000000000000000000000000000000;hb=c02c5c3c54d35e7d5836adf54aadac1f79906f05;hpb=29f1f7e12a4ca6100b00dc0d32e84f82f530bcb4
# diff --git a/collate/icu/_ucol.pyx b/collate/icu/_ucol.pyx
# new file mode 100644
# index 0000000..e54dd53
# --- /dev/null
# +++ b/collate/icu/_ucol.pyx
# @@ -0,0 +1,199 @@
##############################################################################
#
# Copyright (c) 2004 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Simple wrapper for ICU ucol API

"""

import sys

cdef extern from "unicode/utypes.h":

    cdef enum UErrorCode:
        U_USING_DEFAULT_WARNING = -127
        U_USING_FALLBACK_WARNING = -128
        U_ZERO_ERROR = 0
        U_ILLEGAL_ARGUMENT_ERROR = 1
    ctypedef int int32_t
    ctypedef char uint8_t
    int U_FAILURE(UErrorCode status)

cdef extern from "unicode/utf.h":

    ctypedef int UChar
    ctypedef int UChar32

cdef extern from "unicode/ustring.h":

    UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity,
                          int32_t *pDestLength,
                          UChar32 *src, int32_t srcLength,
                          UErrorCode *status)

cdef extern from "unicode/ucol.h":

    ctypedef struct UCollator:
        pass
    UCollator *ucol_open(char *locale, UErrorCode *status)
    void ucol_close(UCollator *collator)
    int32_t ucol_getSortKey(UCollator *coll,
                            UChar *source, int32_t sourceLength,
                            uint8_t *result,
                            int32_t resultLength
                            )
    int ucol_strcoll(UCollator *coll,
                     UChar *source, int32_t sourceLength,
                     UChar *target, int32_t targetLength)

cdef extern from "Python.h":

    int PyUnicode_Check(ob)
    int PyString_Check(ob)

    ctypedef int Py_UNICODE
    Py_UNICODE *PyUnicode_AS_UNICODE(ob)
    int PyUnicode_GET_SIZE(ob)
    char *PyString_AS_STRING(ob)

    void *PyMem_Malloc(int size)
    void PyMem_Free(void *p)
    object PyString_FromStringAndSize(char *v, int l)


cdef class UCharString:
    """Wrapper for ICU UChar arrays

    Holds the UChar representation of a Python unicode object.  When
    ``sizeof(Py_UNICODE) == 2`` the unicode object's own buffer is used
    directly and ``base`` keeps that object alive; otherwise the text is
    converted with ``u_strFromUTF32`` into a buffer owned by this
    instance (``need_to_free`` is then 1).

    Byte strings are accepted and converted with ``unicode(text)``; any
    other type raises TypeError.
    """

    cdef UChar *data
    cdef readonly int32_t length
    cdef readonly object base
    cdef readonly int need_to_free

    def __cinit__(self, text):
        cdef int32_t buffsize
        cdef UErrorCode status
        cdef Py_UNICODE *chars
        cdef int nchars

        if not PyUnicode_Check(text):
            if not PyString_Check(text):
                raise TypeError("Expected unicode string")
            text = unicode(text)
            assert PyUnicode_Check(text)

        nchars = PyUnicode_GET_SIZE(text)
        chars = PyUnicode_AS_UNICODE(text)

        if sizeof(Py_UNICODE) == 2:
            # Borrow the unicode object's buffer; ``base`` holds the
            # reference that keeps the buffer valid.
            self.data = <UChar*>chars
            self.length = nchars
            self.base = text
            self.need_to_free = 0
        else:
            # One code point needs at most two UChar units; +1 leaves
            # room for a terminator.
            buffsize = 2*nchars + 1
            self.data = <UChar*>PyMem_Malloc(buffsize*sizeof(UChar))
            if self.data == NULL:
                raise MemoryError
            # Record ownership before anything below can raise, so that
            # __dealloc__ releases the buffer on every error path.
            self.need_to_free = 1
            status = U_ZERO_ERROR
            u_strFromUTF32(self.data, buffsize, &(self.length),
                           <UChar32*>chars, nchars, &status)
            if U_FAILURE(status):
                raise ValueError(
                    "Couldn't convert Python unicode data to ICU unicode data."
                    )
            assert self.length <= buffsize

    def __dealloc__(self):
        if self.need_to_free and self.data != NULL:
            PyMem_Free(self.data)
            self.data = NULL


cdef class Collator:
    """Compute a collation key for a unicode string.

    ``Collator(locale)`` opens an ICU collator for the byte-string
    *locale*.  TypeError is raised if *locale* is not a byte string and
    ValueError if ICU cannot create the collator.

    ``used_default_information`` is 1 when ICU reported
    U_USING_DEFAULT_WARNING or U_USING_FALLBACK_WARNING while opening
    the collator; otherwise it holds the raw ICU status value (0 for a
    clean open).
    """

    cdef UCollator *collator
    cdef readonly object locale
    cdef readonly int used_default_information

    def __cinit__(self, locale):
        cdef UCollator *collator
        cdef UErrorCode status

        if not PyString_Check(locale):
            raise TypeError("String locale expected")

        status = U_ZERO_ERROR
        collator = ucol_open(PyString_AS_STRING(locale), &status)
        if U_FAILURE(status) or collator == NULL:
            raise ValueError("Couldn't create a collator")
        self.collator = collator
        self.locale = locale
        if (status == U_USING_DEFAULT_WARNING
            or status == U_USING_FALLBACK_WARNING):
            self.used_default_information = 1
        else:
            self.used_default_information = status

    def __dealloc__(self):
        if self.collator != NULL:
            ucol_close(self.collator)

    def key(self, text):
        """Compute a collation key for the given unicode text.

        Of course, the key is only valid for the given locale.

        Returns a byte string holding exactly the number of bytes that
        ``ucol_getSortKey`` reports for the key.  Raises MemoryError if
        the key buffer cannot be allocated, and whatever UCharString
        raises for unsupported *text*.
        """
        cdef char *keybuf
        cdef int32_t bufsize
        cdef int32_t size
        cdef UCharString icutext

        icutext = UCharString(text)
        bufsize = icutext.length*2 + 10

        # the +1 below is needed to avoid an apparent buffer overflow bug
        # in ICU
        keybuf = <char*>PyMem_Malloc(bufsize + 1)
        if keybuf == NULL:
            raise MemoryError
        try:
            size = ucol_getSortKey(self.collator,
                                   icutext.data,
                                   icutext.length,
                                   <uint8_t*>keybuf, bufsize)
            # ucol_getSortKey reports the size it needs; retry with a
            # buffer of that size whenever the current one was too small.
            while size > bufsize:
                bufsize = size
                PyMem_Free(keybuf)
                keybuf = <char*>PyMem_Malloc(bufsize + 1)  # See above +1
                if keybuf == NULL:
                    raise MemoryError
                size = ucol_getSortKey(self.collator,
                                       icutext.data,
                                       icutext.length,
                                       <uint8_t*>keybuf, bufsize)

            return PyString_FromStringAndSize(keybuf, size)
        finally:
            # Runs on success and on error alike; PyMem_Free(NULL) is a
            # no-op, which covers a failed re-allocation above.
            PyMem_Free(keybuf)

    def cmp(self, o1, o2):
        """Compare two strings under this collator's locale.

        Returns the result of ``ucol_strcoll`` for *o1* and *o2*.  Both
        arguments are converted with UCharString, which raises TypeError
        for objects that are neither unicode nor byte strings.
        """
        cdef UCharString u1
        cdef UCharString u2

        u1 = UCharString(o1)
        u2 = UCharString(o2)
        return ucol_strcoll(
            self.collator,
            u1.data,
            u1.length,
            u2.data,
            u2.length,
            )