1 ##############################################################################
3 # Copyright (c) 2004 Zope Corporation and Contributors.
6 # This software is subject to the provisions of the Zope Public License,
7 # Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
8 # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
9 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
10 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
11 # FOR A PARTICULAR PURPOSE.
13 ##############################################################################
14 """Simple wrapper for ICU ucol API
20 cdef extern from "unicode/utypes.h":
23 U_USING_DEFAULT_WARNING = -127
24 U_USING_FALLBACK_WARNING = -128
26 U_ILLEGAL_ARGUMENT_ERROR = 1
29 int U_FAILURE(UErrorCode status)
31 cdef extern from "unicode/utf.h":
36 cdef extern from "unicode/ustring.h":
38 UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity,
40 UChar32 *src, int32_t srcLength,
43 cdef extern from "unicode/ucol.h":
45 ctypedef struct UCollator:
47 UCollator *ucol_open(char *locale, UErrorCode *status)
48 void ucol_close(UCollator *collator)
49 int32_t ucol_getSortKey(UCollator *coll,
50 UChar *source, int32_t sourceLength,
54 int ucol_strcoll(UCollator *coll,
55 UChar *source, int32_t sourceLength,
56 UChar *target, int32_t targetLength)
58 cdef extern from "unicode/ubrk.h":
59 cdef enum UBreakIteratorType:
64 # UBRK_TITLE = 4 # Deprecated
67 DEF UBRK_DONE = ((int)(-1))
69 ctypedef struct UBreakIterator:
72 UBreakIterator *ubrk_open(UBreakIteratorType type,
77 void ubrk_close(UBreakIterator *bi)
78 void ubrk_setText(UBreakIterator *bi,
82 int32_t ubrk_current(UBreakIterator *bi)
83 int32_t ubrk_next(UBreakIterator *bi)
84 int32_t ubrk_previous(UBreakIterator *bi)
85 int32_t ubrk_first(UBreakIterator *bi)
86 int32_t ubrk_last(UBreakIterator *bi)
87 int32_t ubrk_preceding(UBreakIterator *bi, int32_t offset)
88 int32_t ubrk_following(UBreakIterator *bi, int32_t offset)
91 cdef extern from "Python.h":
93 int PyUnicode_Check(ob)
94 int PyString_Check(ob)
96 ctypedef int Py_UNICODE
97 Py_UNICODE *PyUnicode_AS_UNICODE(ob)
98 int PyUnicode_GET_SIZE(ob)
99 char *PyString_AS_STRING(ob)
101 void *PyMem_Malloc(int size)
102 void PyMem_Free(void *p)
103 object PyString_FromStringAndSize(char *v, int l)
106 cdef class UCharString:
107 """Wrapper for ICU UChar arrays
111 cdef readonly int32_t length
112 cdef readonly object base
113 cdef readonly int need_to_free
# Build the ICU UChar representation of ``text`` and record its length.
#
# Input that is not a unicode object is rejected with
# TypeError("Expected unicode string"), except for the byte-string case
# handled just below.
# NOTE(review): the body of the PyString_Check branch is not visible in this
# listing; presumably it converts ``text`` to unicode first -- confirm.
115 def __cinit__(self, text):
116 cdef int32_t buffsize
117 cdef UErrorCode status
121 if not PyUnicode_Check(text):
122 if PyString_Check(text):
124 assert PyUnicode_Check(text)
126 raise TypeError("Expected unicode string")
128 length = PyUnicode_GET_SIZE(text)
129 str = PyUnicode_AS_UNICODE(text)
# Two storage strategies follow, selected by the width of Py_UNICODE.
# In the two-byte case nothing is marked for freeing (need_to_free = 0).
132 if sizeof(Py_UNICODE) == 2:
136 self.need_to_free = 0
# Otherwise the text is converted with u_strFromUTF32 into a buffer
# allocated here. 2*length + 1 UChars are reserved: one code point needs at
# most two UTF-16 units; the extra slot is presumably for a terminator.
138 buffsize = 2*length + 1
139 self.data = <UChar*>PyMem_Malloc(buffsize*sizeof(UChar))
140 if self.data == NULL:
142 status = U_ZERO_ERROR
143 u_strFromUTF32(self.data, buffsize, &(self.length),
144 <UChar32*>str, length, &status)
145 assert self.length <= buffsize
# The buffer is flagged as owned before the failure check, so the
# need_to_free test in __dealloc__ also covers the conversion-error path.
146 self.need_to_free = 1
147 if U_FAILURE(status):
149 "Couldn't convert Python unicode data to ICU unicode data."
def __dealloc__(self):
    # Release the UChar buffer, but only when one exists and this object
    # was flagged as its owner (need_to_free).
    if self.data != NULL:
        if self.need_to_free:
            PyMem_Free(self.data)
159 """Compute a collation key for a unicode string.
162 cdef UCollator *collator
163 cdef readonly object locale
164 cdef readonly int used_default_information
# Open an ICU collator for the locale named by the byte string ``locale``.
#
# Raises TypeError("String locale expected") when ``locale`` is not a byte
# string, and ValueError("Couldn't create a collator") when ucol_open
# reports a failure status.
166 def __cinit__(self, locale):
167 cdef UCollator *collator
168 cdef UErrorCode status
170 if not PyString_Check(locale):
171 raise TypeError("String locale expected")
173 status = U_ZERO_ERROR
174 collator = ucol_open(PyString_AS_STRING(locale), &status)
175 if U_FAILURE(status):
176 raise ValueError("Couldn't create a collator")
177 self.collator = collator
# The two warning statuses tested here are not failures, so the collator is
# kept. In that case status is overwritten with U_ILLEGAL_ARGUMENT_ERROR,
# which this file declares as 1, and that value ends up in
# used_default_information.
# NOTE(review): with indentation lost in this listing it is unclear whether
# the final assignment is inside the `if` or follows it -- confirm.
179 if (status == U_USING_DEFAULT_WARNING
180 or status == U_USING_FALLBACK_WARNING):
181 status = U_ILLEGAL_ARGUMENT_ERROR
182 self.used_default_information = status
def __dealloc__(self):
    # A NULL handle means no collator was stored on this object, so there
    # is nothing to close.
    if self.collator == NULL:
        return
    ucol_close(self.collator)
189 """Compute a collation key for the given unicode text.
191 Of course, the key is only valid for the given locale.
197 icutext = UCharString(text)
198 bufsize = (<UCharString>icutext).length*2+10
200 # the +1 below is needed to avoid an apparent buffer overflow bug in ICU
201 buffer = <char*>PyMem_Malloc(bufsize +1)
204 size = ucol_getSortKey(self.collator,
205 (<UCharString>icutext).data,
206 (<UCharString>icutext).length,
207 <uint8_t*>buffer, bufsize)
208 while size > bufsize:
211 buffer = <char*>PyMem_Malloc(bufsize +1) # See above +1
214 size = ucol_getSortKey(self.collator,
215 (<UCharString>icutext).data,
216 (<UCharString>icutext).length,
217 <uint8_t*>buffer, bufsize)
219 result = PyString_FromStringAndSize(buffer, size)
# Compare ``o1`` and ``o2``.
#
# The visible lines pass the UChar data pointer and length of two
# UCharString objects (u1, u2) as arguments to a call.
# NOTE(review): the lines that build u1/u2 and the head of the call are not
# visible in this listing; presumably u1/u2 wrap o1/o2 and the call is
# ucol_strcoll on self.collator -- confirm against the full source.
223 def cmp(self, o1, o2):
228 (<UCharString>u1).data,
229 (<UCharString>u1).length,
230 (<UCharString>u2).data,
231 (<UCharString>u2).length,
234 cdef class WordBreaker:
235 cdef UBreakIterator *breaker
236 cdef readonly object locale
# Open an ICU word-break iterator (UBRK_WORD) for the locale named by the
# byte string ``locale``.
#
# Raises:
#   TypeError  -- ``locale`` is not a byte string.
#   ValueError -- ICU could not create the iterator, or it fell back to
#                 default/fallback data for a locale other than "root"/"C".
def __cinit__(self, locale):
    cdef UBreakIterator *breaker
    cdef UErrorCode status

    # PyString_AS_STRING performs no type checking, so reject anything that
    # is not a byte string first (same check and message as the collator
    # constructor in this file).
    if not PyString_Check(locale):
        raise TypeError("String locale expected")

    status = U_ZERO_ERROR
    breaker = ubrk_open(UBRK_WORD, PyString_AS_STRING(locale),
                        NULL, 0, &status)
    if U_FAILURE(status):
        raise ValueError("Couldn't create a breaker")

    if ((status == U_USING_DEFAULT_WARNING
         or status == U_USING_FALLBACK_WARNING)
            and locale != "root" and locale != "C"):
        # The iterator was opened successfully but is being rejected.
        # self.breaker has not been set yet, so __dealloc__ would skip it;
        # close it here to avoid leaking the ICU object.
        ubrk_close(breaker)
        raise ValueError("Invalid locale %s" % locale)

    self.breaker = breaker
# Split ``string`` at the boundaries reported by the ICU break iterator.
#
# The text is wrapped in a UCharString and handed to the iterator with
# ubrk_setText; ValueError is raised if ICU reports a failure for that call.
# NOTE(review): the initialisation of ``words``, the step that advances
# ``p`` and the return statement are not visible in this listing.
254 def words(self, string):
255 cdef UErrorCode status
257 status = U_ZERO_ERROR
258 uni = UCharString(string)
259 ubrk_setText(self.breaker,
260 (<UCharString>uni).data,
261 (<UCharString>uni).length, &status)
262 if U_FAILURE(status):
263 raise ValueError("Couldn't set text to %s: %d" % (string, status))
# Walk the boundaries from the first one; every non-empty span between two
# boundaries (p, n) is appended as a slice of the original string.
264 p = ubrk_first(self.breaker)
266 while p != UBRK_DONE:
267 n = ubrk_next(self.breaker)
268 if p != n and n != UBRK_DONE:
# NOTE(review): p and n are offsets ICU reports for the UChar buffer, yet
# they index the Python string here. When UCharString had to convert via
# u_strFromUTF32 (Py_UNICODE not two bytes wide) the two may disagree for
# characters that need two UChars -- confirm.
269 words.append(string[p:n])
def __dealloc__(self):
    # Nothing to release when no break iterator was stored on this object.
    if self.breaker == NULL:
        return
    ubrk_close(self.breaker)