Word-splitting.
[python-collate.git] / collate / icu / _icu.pyx
diff --git a/collate/icu/_icu.pyx b/collate/icu/_icu.pyx
new file mode 100644 (file)
index 0000000..c3b1484
--- /dev/null
@@ -0,0 +1,276 @@
+##############################################################################
+#
+# Copyright (c) 2004 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Simple wrappers for the ICU ucol (collation) and ubrk (text break) APIs.
+
+"""
+
+import sys
+
+# Status codes, basic integer typedefs and the failure test used by the
+# wrapper classes in this module.
+cdef extern from "unicode/utypes.h":
+
+    # Only the status values this module assigns or compares against are
+    # declared here.
+    cdef enum UErrorCode:
+        U_USING_DEFAULT_WARNING = -127
+        U_USING_FALLBACK_WARNING = -128
+        U_ZERO_ERROR = 0
+        U_ILLEGAL_ARGUMENT_ERROR = 1
+    ctypedef int int32_t
+    ctypedef char uint8_t
+    # Callers in this file test U_FAILURE(status) first and inspect the two
+    # *_WARNING codes afterwards, i.e. they rely on warnings not counting as
+    # failures.
+    int U_FAILURE(UErrorCode status)
+
+# Character types: UChar is the element type of the buffers handed to the
+# ucol_* and ubrk_* functions, UChar32 the source element type for
+# u_strFromUTF32.
+cdef extern from "unicode/utf.h":
+
+    # NOTE(review): both are declared as plain int here; presumably the real
+    # widths come from the ICU header at C compile time -- confirm.
+    ctypedef int UChar
+    ctypedef int UChar32
+
+# String conversion helper used by UCharString when Py_UNICODE is not two
+# bytes wide.
+cdef extern from "unicode/ustring.h":
+    
+    # Converts srcLength UChar32 values from src into dest (capacity
+    # destCapacity, in UChars); the resulting length is written to
+    # *pDestLength and the outcome to *status.
+    UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity,
+                          int32_t *pDestLength,
+                          UChar32 *src, int32_t srcLength,
+                          UErrorCode *status)
+
+# Collation API used by the Collator class.
+cdef extern from "unicode/ucol.h":
+
+    # Opaque handle; only ever used through a pointer.
+    ctypedef struct UCollator:
+        pass
+    UCollator *ucol_open(char *locale, UErrorCode *status)
+    void ucol_close(UCollator *collator)
+    # Collator.key treats the return value as the number of bytes required
+    # and retries with a larger buffer when it exceeds resultLength.
+    int32_t ucol_getSortKey(UCollator *coll,
+                            UChar *source, int32_t sourceLength,
+                            uint8_t *result,
+                            int32_t resultLength
+                            )
+    # The result is returned unchanged by Collator.cmp.
+    int ucol_strcoll(UCollator *coll,
+                     UChar *source, int32_t sourceLength,
+                     UChar *target, int32_t targetLength)
+
+# Break-iteration API used by the WordBreaker class.
+cdef extern from "unicode/ubrk.h":
+    # Kinds of boundaries an iterator can report; this file only opens
+    # UBRK_WORD iterators.
+    cdef enum UBreakIteratorType:
+        UBRK_CHARACTER = 0
+        UBRK_WORD = 1
+        UBRK_LINE = 2
+        UBRK_SENTENCE = 3
+        # UBRK_TITLE = 4 # Deprecated
+        UBRK_COUNT = 5
+
+    # Compile-time constant that WordBreaker.words compares against the
+    # offsets returned by ubrk_first/ubrk_next to detect the end of iteration.
+    DEF UBRK_DONE = ((int)(-1))
+
+    # Opaque handle; only ever used through a pointer.
+    ctypedef struct UBreakIterator:
+        pass
+
+    UBreakIterator *ubrk_open(UBreakIteratorType type,
+                              char *locale,
+                              UChar *text,
+                              int32_t textLength,
+                              UErrorCode *status)
+    void ubrk_close(UBreakIterator *bi)
+    void ubrk_setText(UBreakIterator *bi,
+                      UChar *text,
+                      int32_t textLength,
+                      UErrorCode *status)
+    # Of the navigation functions below, only ubrk_first and ubrk_next are
+    # called in the code visible in this file; the others are declared but
+    # unused here.
+    int32_t ubrk_current(UBreakIterator *bi)
+    int32_t ubrk_next(UBreakIterator *bi)
+    int32_t ubrk_previous(UBreakIterator *bi)
+    int32_t ubrk_first(UBreakIterator *bi)
+    int32_t ubrk_last(UBreakIterator *bi)
+    int32_t ubrk_preceding(UBreakIterator *bi, int32_t offset)
+    int32_t ubrk_following(UBreakIterator *bi, int32_t offset)
+
+
+# CPython C-API pieces used directly by this module: type checks, raw access
+# to string/unicode buffers, and the PyMem allocator.
+cdef extern from "Python.h":
+
+    int PyUnicode_Check(ob)
+    int PyString_Check(ob)
+
+    # UCharString checks sizeof(Py_UNICODE) to decide whether the Python
+    # buffer can be shared with ICU as-is or has to be converted.
+    ctypedef int Py_UNICODE
+    Py_UNICODE *PyUnicode_AS_UNICODE(ob)
+    int PyUnicode_GET_SIZE(ob)
+    # Callers in this file check PyString_Check before using this.
+    # NOTE(review): WordBreaker.__cinit__ does not -- confirm that is intended.
+    char *PyString_AS_STRING(ob)
+
+    void *PyMem_Malloc(int size)
+    void PyMem_Free(void *p)
+    # Used to copy a sort-key buffer into a Python byte string.
+    object PyString_FromStringAndSize(char *v, int l)
+    
+    
+cdef class UCharString:
+    """Wrapper for ICU UChar arrays.
+
+    Exposes the contents of a Python string as a ``UChar`` buffer that can
+    be passed to ICU.
+
+    When ``Py_UNICODE`` is two bytes wide the Python unicode object's own
+    buffer is shared (and the object kept alive through ``base``).
+    Otherwise the text is converted with ``u_strFromUTF32`` into a buffer
+    owned by this object and released in ``__dealloc__``.
+
+    Attributes:
+        length: number of UChar units in ``data``.
+        base: the unicode object whose buffer is shared, if any.
+        need_to_free: non-zero when ``data`` was allocated by this object.
+
+    Raises:
+        TypeError: if ``text`` is neither a unicode nor a byte string.
+        MemoryError: if the conversion buffer cannot be allocated.
+        ValueError: if ICU fails to convert the text.
+    """
+
+    cdef UChar *data
+    cdef readonly int32_t length
+    cdef readonly object base
+    cdef readonly int need_to_free
+
+    def __cinit__(self, text):
+        cdef int32_t buffsize
+        cdef UErrorCode status
+        cdef Py_UNICODE *text_data
+        cdef int text_length
+
+        if not PyUnicode_Check(text):
+            if not PyString_Check(text):
+                raise TypeError("Expected unicode string")
+            # Byte strings are decoded with the default codec.
+            text = unicode(text)
+            assert PyUnicode_Check(text)
+
+        text_length = PyUnicode_GET_SIZE(text)
+        text_data = PyUnicode_AS_UNICODE(text)
+
+        if sizeof(Py_UNICODE) == 2:
+            # Same unit width as UChar: share the buffer and keep the
+            # unicode object alive for as long as we point into it.
+            self.data = <UChar*>text_data
+            self.length = text_length
+            self.base = text
+            self.need_to_free = 0
+        else:
+            # Each source character needs at most two UChars.
+            buffsize = 2*text_length + 1
+            self.data = <UChar*>PyMem_Malloc(buffsize*sizeof(UChar))
+            if self.data == NULL:
+                raise MemoryError
+            # Take ownership right away so that every exit path below,
+            # including a failed assertion, lets __dealloc__ free the buffer.
+            self.need_to_free = 1
+            status = U_ZERO_ERROR
+            u_strFromUTF32(self.data, buffsize, &(self.length),
+                           <UChar32*>text_data, text_length, &status)
+            # Check the status before trusting the reported length.
+            if U_FAILURE(status):
+                raise ValueError(
+                    "Couldn't convert Python unicode data to ICU unicode data."
+                    )
+            assert self.length <= buffsize
+
+    def __dealloc__(self):
+        # Only free buffers allocated in __cinit__; shared buffers belong
+        # to the unicode object referenced by ``base``.
+        if self.need_to_free and self.data != NULL:
+            PyMem_Free(self.data)
+            self.data = NULL
+
+
+cdef class Collator:
+    """Compute collation keys and compare unicode strings for a locale.
+
+    Attributes:
+        locale: the locale name the collator was opened with.
+        used_default_information: set to 1 when ICU reported
+            U_USING_DEFAULT_WARNING or U_USING_FALLBACK_WARNING while
+            opening the collator; otherwise the raw (non-failure) status
+            returned by ``ucol_open``.
+
+    Raises:
+        TypeError: if ``locale`` is not a byte string.
+        ValueError: if ICU cannot create the collator.
+    """
+
+    cdef UCollator *collator
+    cdef readonly object locale
+    cdef readonly int used_default_information
+
+    def __cinit__(self, locale):
+        cdef UCollator *collator
+        cdef UErrorCode status
+
+        if not PyString_Check(locale):
+            raise TypeError("String locale expected")
+
+        status = U_ZERO_ERROR
+        collator = ucol_open(PyString_AS_STRING(locale), &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't create a collator")
+        self.collator = collator
+        self.locale = locale
+        # The two "fell back to other data" warnings are folded into the
+        # value 1 (U_ILLEGAL_ARGUMENT_ERROR is only borrowed for its numeric
+        # value) so the attribute reads as a true flag.
+        if (status == U_USING_DEFAULT_WARNING
+            or status == U_USING_FALLBACK_WARNING):
+            status = U_ILLEGAL_ARGUMENT_ERROR
+        self.used_default_information = status
+
+    def __dealloc__(self):
+        if self.collator != NULL:
+            ucol_close(self.collator)
+
+    def key(self, text):
+        """Compute a collation key for the given unicode text.
+
+        Of course, the key is only valid for the given locale.
+
+        Returns the key as a byte string.  Raises MemoryError if a buffer
+        cannot be allocated, and whatever UCharString raises for
+        unsuitable ``text``.
+        """
+        cdef char *buffer
+        cdef int32_t bufsize
+        cdef int32_t size
+        cdef UCharString icutext
+
+        icutext = UCharString(text)
+        # First guess at the key size; enlarged below if ICU asks for more.
+        bufsize = icutext.length*2+10
+
+        # the +1 below is needed to avoid an apparent buffer overflow bug
+        # in ICU
+        buffer = <char*>PyMem_Malloc(bufsize +1)
+        if buffer == NULL:
+            raise MemoryError
+        size = ucol_getSortKey(self.collator,
+                               icutext.data, icutext.length,
+                               <uint8_t*>buffer, bufsize)
+        while size > bufsize:
+            # The key did not fit: retry with the size ICU reported.
+            bufsize = size
+            PyMem_Free(buffer)
+            buffer = <char*>PyMem_Malloc(bufsize +1) # See above +1
+            if buffer == NULL:
+                raise MemoryError
+            size = ucol_getSortKey(self.collator,
+                                   icutext.data, icutext.length,
+                                   <uint8_t*>buffer, bufsize)
+
+        # Building the Python string can itself fail; make sure the C buffer
+        # is released either way.
+        try:
+            return PyString_FromStringAndSize(buffer, size)
+        finally:
+            PyMem_Free(buffer)
+
+    def cmp(self, o1, o2):
+        """Compare two strings with this collator.
+
+        Returns the result of ``ucol_strcoll`` for ``o1`` and ``o2``
+        unchanged.
+        """
+        cdef UCharString u1
+        cdef UCharString u2
+
+        u1 = UCharString(o1)
+        u2 = UCharString(o2)
+        return ucol_strcoll(
+            self.collator,
+            u1.data, u1.length,
+            u2.data, u2.length,
+            )
+
+cdef class WordBreaker:
+    """Split text at the word boundaries ICU reports for a locale.
+
+    Attributes:
+        locale: the locale name the break iterator was opened with.
+
+    Raises:
+        TypeError: if ``locale`` is not a byte string.
+        ValueError: if ICU cannot create the iterator, or if it had to fall
+            back to default data for a locale other than "root" or "C".
+    """
+
+    cdef UBreakIterator *breaker
+    cdef readonly object locale
+
+    def __cinit__(self, locale):
+        cdef UBreakIterator *breaker
+        cdef UErrorCode status
+        cdef char *clocale
+
+        # PyString_AS_STRING performs no type checking, so reject anything
+        # that is not a byte string first (same check as Collator).
+        if not PyString_Check(locale):
+            raise TypeError("String locale expected")
+
+        status = U_ZERO_ERROR
+        clocale = PyString_AS_STRING(locale)
+        breaker = ubrk_open(UBRK_WORD, clocale, NULL, 0, &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't create a breaker")
+        # Store the handle before any further check can raise, so that
+        # __dealloc__ closes it instead of leaking it.
+        self.breaker = breaker
+        if ((status == U_USING_DEFAULT_WARNING
+             or status == U_USING_FALLBACK_WARNING)
+            and locale != "root" and locale != "C"):
+            raise ValueError("Invalid locale %s" % locale)
+        self.locale = locale
+
+    def words(self, string):
+        """Return the list of non-empty segments between word boundaries.
+
+        Segments are slices of ``string`` itself, in order, covering the
+        text from the first boundary to the last.
+
+        Raises ValueError if ICU rejects the text, and whatever UCharString
+        raises for unsuitable ``string``.
+        """
+        cdef UErrorCode status
+        cdef UCharString uni
+        cdef int32_t p, n, i
+        cdef int start, end
+
+        status = U_ZERO_ERROR
+        uni = UCharString(string)
+        ubrk_setText(self.breaker, uni.data, uni.length, &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't set text to %s: %d" % (string, status))
+
+        words = []
+        p = ubrk_first(self.breaker)
+        # ``p``/``n`` are offsets into the UChar buffer, ``start``/``end``
+        # the matching indexes into ``string``.  They only differ when the
+        # buffer was converted and contains surrogate pairs.
+        start = 0
+        while p != UBRK_DONE:
+            n = ubrk_next(self.breaker)
+            if n != UBRK_DONE:
+                end = start + (n - p)
+                if uni.need_to_free:
+                    # Converted text: a character outside the BMP occupies
+                    # two UChars but a single position in ``string``, so
+                    # discount one unit per trailing surrogate.
+                    for i in range(p, n):
+                        if (uni.data[i] & 0xFC00) == 0xDC00:
+                            end = end - 1
+                if n != p:
+                    words.append(string[start:end])
+                start = end
+            p = n
+        return words
+
+    def __dealloc__(self):
+        if self.breaker != NULL:
+            ubrk_close(self.breaker)
+