ICU backend; uses Pyrex, based on zope.ucol.
[python-collate.git] / collate / icu / _ucol.pyx
1 ##############################################################################
2 #
3 # Copyright (c) 2004 Zope Corporation and Contributors.
4 # All Rights Reserved.
5 #
6 # This software is subject to the provisions of the Zope Public License,
7 # Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
8 # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
9 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
10 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
11 # FOR A PARTICULAR PURPOSE.
12 #
13 ##############################################################################
14 """Simple wrapper for ICU ucol API
15
16 """
17
18 import sys
19
cdef extern from "unicode/utypes.h":

    # Integer type names used in the ICU prototypes declared in this module.
    ctypedef int int32_t
    ctypedef char uint8_t

    # ICU status codes that this module inspects by name.
    cdef enum UErrorCode:
        U_ZERO_ERROR = 0
        U_ILLEGAL_ARGUMENT_ERROR = 1
        U_USING_FALLBACK_WARNING = -128
        U_USING_DEFAULT_WARNING = -127

    # Tested as a boolean by callers: true means `status` is an error.
    int U_FAILURE(UErrorCode status)
30
cdef extern from "unicode/utf.h":

    # Code-unit and code-point types used by the ICU string routines.
    ctypedef int UChar32
    ctypedef int UChar
35
cdef extern from "unicode/ustring.h":

    # Converts `srcLength` UChar32 items from `src` into the UChar buffer
    # `dest` (capacity `destCapacity`); the produced length is written to
    # `pDestLength` and the outcome to `status`.
    UChar *u_strFromUTF32(
        UChar *dest,
        int32_t destCapacity,
        int32_t *pDestLength,
        UChar32 *src,
        int32_t srcLength,
        UErrorCode *status)
42
cdef extern from "unicode/ucol.h":

    # Opaque collator handle; only ever used through a pointer.
    ctypedef struct UCollator:
        pass

    # Collator lifetime: open for a locale name, close when done.
    UCollator *ucol_open(char *locale, UErrorCode *status)
    void ucol_close(UCollator *collator)

    # Writes the sort key for `source` into `result` (capacity
    # `resultLength`) and returns the size the key requires.
    int32_t ucol_getSortKey(
        UCollator *coll,
        UChar *source,
        int32_t sourceLength,
        uint8_t *result,
        int32_t resultLength)

    # Compares two UChar strings under the collator's rules.
    int ucol_strcoll(
        UCollator *coll,
        UChar *source,
        int32_t sourceLength,
        UChar *target,
        int32_t targetLength)
57
cdef extern from "Python.h":

    # Type checks.
    int PyString_Check(ob)
    int PyUnicode_Check(ob)

    # Raw access to unicode objects.
    ctypedef int Py_UNICODE
    int PyUnicode_GET_SIZE(ob)
    Py_UNICODE *PyUnicode_AS_UNICODE(ob)

    # Raw access to, and construction of, byte strings.
    char *PyString_AS_STRING(ob)
    object PyString_FromStringAndSize(char *v, int l)

    # Python's memory allocator.
    void *PyMem_Malloc(int size)
    void PyMem_Free(void *p)
71
72
cdef class UCharString:
    """Wrapper for ICU UChar arrays.

    Exposes the contents of a Python unicode object as a ``UChar`` buffer
    (``data``) with its length in UChar units (``length``).

    When ``sizeof(Py_UNICODE) == 2`` the unicode object's own buffer is used
    directly and ``base`` holds a reference to that object so the buffer
    stays alive.  Otherwise the text is converted with ``u_strFromUTF32``
    into a buffer obtained from ``PyMem_Malloc``; ``need_to_free`` is then 1
    and the buffer is released in ``__dealloc__``.
    """

    cdef UChar *data
    cdef readonly int32_t length
    cdef readonly object base
    cdef readonly int need_to_free

    def __cinit__(self, text):
        """Build the UChar view of ``text``.

        ``text`` must be a unicode object, or a byte string that
        ``unicode()`` can convert.

        Raises TypeError for any other type, MemoryError if the conversion
        buffer cannot be allocated, and ValueError if ICU reports a
        conversion failure.
        """
        cdef int32_t buffsize
        cdef UErrorCode status
        cdef Py_UNICODE *chars
        cdef int length

        if not PyUnicode_Check(text):
            if not PyString_Check(text):
                raise TypeError("Expected unicode string")
            text = unicode(text)
            assert PyUnicode_Check(text)

        length = PyUnicode_GET_SIZE(text)
        chars = PyUnicode_AS_UNICODE(text)

        if sizeof(Py_UNICODE) == 2:
            # Share the unicode object's buffer; `base` keeps it alive.
            self.data = chars
            self.length = length
            self.base = text
            self.need_to_free = 0
        else:
            # Room for two UChar units per input item, plus one extra.
            buffsize = 2*length + 1
            self.data = <UChar*>PyMem_Malloc(buffsize*sizeof(UChar))
            if self.data == NULL:
                raise MemoryError
            # Record ownership before anything below can raise, so that
            # __dealloc__ releases the buffer on every failure path.
            self.need_to_free = 1
            status = U_ZERO_ERROR
            u_strFromUTF32(self.data, buffsize, &(self.length),
                           <UChar32*>chars, length, &status)
            assert self.length <= buffsize
            if U_FAILURE(status):
                raise ValueError(
                    "Couldn't convert Python unicode data to ICU unicode data."
                    )

    def __dealloc__(self):
        # Only free buffers this object allocated itself.
        if self.need_to_free and self.data != NULL:
            PyMem_Free(self.data)
            self.data = NULL
123
124
cdef class Collator:
    """Compute collation keys and comparisons for unicode strings.

    Wraps an ICU ``UCollator`` opened for ``locale``.

    ``used_default_information`` is 1 when ``ucol_open`` reported
    ``U_USING_DEFAULT_WARNING`` or ``U_USING_FALLBACK_WARNING``; otherwise
    it holds the status code returned by ``ucol_open`` (``U_ZERO_ERROR``,
    i.e. 0, when there was nothing to report).
    """

    cdef UCollator *collator
    cdef readonly object locale
    cdef readonly int used_default_information

    def __cinit__(self, locale):
        """Open an ICU collator for the byte-string ``locale``.

        Raises TypeError if ``locale`` is not a byte string and ValueError
        if ICU fails to create the collator.
        """
        cdef UCollator *collator
        cdef UErrorCode status

        if not PyString_Check(locale):
            raise TypeError("String locale expected")

        status = U_ZERO_ERROR
        collator = ucol_open(PyString_AS_STRING(locale), &status)
        if U_FAILURE(status):
            raise ValueError("Couldn't create a collator")
        self.collator = collator
        self.locale = locale
        if (status == U_USING_DEFAULT_WARNING
            or status == U_USING_FALLBACK_WARNING):
            self.used_default_information = 1
        else:
            self.used_default_information = status

    def __dealloc__(self):
        if self.collator != NULL:
            ucol_close(self.collator)

    def key(self, text):
        """Compute a collation key for the given unicode text.

        Of course, the key is only valid for the given locale.

        Returns a byte string holding the bytes produced by
        ``ucol_getSortKey``.  Raises MemoryError if the key buffer cannot
        be allocated, plus whatever ``UCharString(text)`` raises.
        """
        cdef char *buffer
        cdef int32_t bufsize
        cdef int32_t size

        icutext = UCharString(text)
        # Initial guess; grown below if ICU says more room is needed.
        bufsize = (<UCharString>icutext).length*2+10

        # the +1 below is needed to avoid an apparent buffer overflow bug
        # in ICU
        buffer = <char*>PyMem_Malloc(bufsize + 1)
        if buffer == NULL:
            raise MemoryError
        try:
            size = ucol_getSortKey(self.collator,
                                   (<UCharString>icutext).data,
                                   (<UCharString>icutext).length,
                                   <uint8_t*>buffer, bufsize)
            while size > bufsize:
                # The key did not fit: retry with the size ICU reported.
                bufsize = size
                PyMem_Free(buffer)
                buffer = <char*>PyMem_Malloc(bufsize + 1)  # See above +1
                if buffer == NULL:
                    raise MemoryError
                size = ucol_getSortKey(self.collator,
                                       (<UCharString>icutext).data,
                                       (<UCharString>icutext).length,
                                       <uint8_t*>buffer, bufsize)

            result = PyString_FromStringAndSize(buffer, size)
        finally:
            # Release the buffer even if building the result string fails.
            if buffer != NULL:
                PyMem_Free(buffer)
        return result

    def cmp(self, o1, o2):
        """Compare ``o1`` and ``o2`` under this collator's rules.

        Both arguments are converted with ``UCharString``; the return value
        is the integer result of ``ucol_strcoll``.
        """
        u1 = UCharString(o1)
        u2 = UCharString(o2)
        return ucol_strcoll(
            self.collator,
            (<UCharString>u1).data,
            (<UCharString>u1).length,
            (<UCharString>u2).data,
            (<UCharString>u2).length,
            )