Word-splitting.
[python-collate.git] / collate / icu / _icu.pyx
1 ##############################################################################
2 #
3 # Copyright (c) 2004 Zope Corporation and Contributors.
4 # All Rights Reserved.
5 #
6 # This software is subject to the provisions of the Zope Public License,
7 # Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
8 # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
9 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
10 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
11 # FOR A PARTICULAR PURPOSE.
12 #
13 ##############################################################################
14 """Simple wrapper for ICU ucol API
15
16 """
17
18 import sys
19
cdef extern from "unicode/utypes.h":
    # Basic ICU types and status codes.  Only the status values that this
    # module actually inspects are declared here.

    cdef enum UErrorCode:
        U_USING_FALLBACK_WARNING = -128
        U_USING_DEFAULT_WARNING = -127
        U_ZERO_ERROR = 0
        U_ILLEGAL_ARGUMENT_ERROR = 1

    # Fixed-width integer aliases used in the ICU signatures below.
    ctypedef int int32_t
    ctypedef char uint8_t

    # True when `status` is an error.  The two *_WARNING codes above are
    # tested separately by callers after this check has passed.
    int U_FAILURE(UErrorCode status)
30
cdef extern from "unicode/utf.h":
    # ICU character types.  They are declared as plain ints here; the
    # real definitions are taken from the C header at compile time.

    # Element type of the text buffers handed to the ICU calls below.
    ctypedef int UChar
    # Element type of the source buffer given to u_strFromUTF32.
    ctypedef int UChar32
35
cdef extern from "unicode/ustring.h":
    # Converts `srcLength` UChar32 units at `src` into the UChar buffer
    # `dest` (capacity `destCapacity`).  The resulting length is written
    # through `pDestLength` and the outcome through `status`.
    UChar *u_strFromUTF32(
        UChar *dest,
        int32_t destCapacity,
        int32_t *pDestLength,
        UChar32 *src,
        int32_t srcLength,
        UErrorCode *status)
42
cdef extern from "unicode/ucol.h":
    # ICU collation API.

    # Opaque collator handle; only ever used through a pointer.
    ctypedef struct UCollator:
        pass

    # Lifetime management: every successful ucol_open is paired with a
    # ucol_close.
    UCollator *ucol_open(char *locale, UErrorCode *status)
    void ucol_close(UCollator *collator)

    # Writes the sort key for `source` into `result`.  Callers compare the
    # return value with `resultLength` to detect a buffer that was too
    # small and retry with a larger one.
    int32_t ucol_getSortKey(
        UCollator *coll,
        UChar *source, int32_t sourceLength,
        uint8_t *result, int32_t resultLength)

    # Compares `source` with `target` under the collator's rules.
    int ucol_strcoll(
        UCollator *coll,
        UChar *source, int32_t sourceLength,
        UChar *target, int32_t targetLength)
57
cdef extern from "unicode/ubrk.h":
    # ICU boundary-analysis (break iterator) API.

    # Kinds of boundaries a break iterator can look for.
    cdef enum UBreakIteratorType:
        UBRK_CHARACTER = 0
        UBRK_WORD = 1
        UBRK_LINE = 2
        UBRK_SENTENCE = 3
        # UBRK_TITLE = 4 # Deprecated
        UBRK_COUNT = 5

    # Sentinel returned by the iteration functions when there are no more
    # boundaries.  It is a compile-time constant, not a C symbol.
    DEF UBRK_DONE = ((int)(-1))

    # Opaque iterator handle; only ever used through a pointer.
    ctypedef struct UBreakIterator:
        pass

    # Creation, destruction and (re)binding of the text to analyse.
    UBreakIterator *ubrk_open(
        UBreakIteratorType type,
        char *locale,
        UChar *text, int32_t textLength,
        UErrorCode *status)
    void ubrk_close(UBreakIterator *bi)
    void ubrk_setText(
        UBreakIterator *bi,
        UChar *text, int32_t textLength,
        UErrorCode *status)

    # Navigation.  Each call returns a boundary offset into the text.
    int32_t ubrk_current(UBreakIterator *bi)
    int32_t ubrk_next(UBreakIterator *bi)
    int32_t ubrk_previous(UBreakIterator *bi)
    int32_t ubrk_first(UBreakIterator *bi)
    int32_t ubrk_last(UBreakIterator *bi)
    int32_t ubrk_preceding(UBreakIterator *bi, int32_t offset)
    int32_t ubrk_following(UBreakIterator *bi, int32_t offset)
89
90
cdef extern from "Python.h":
    # The slice of the Python 2 C API that this module calls directly.

    # Element type of a unicode object's internal buffer.
    ctypedef int Py_UNICODE

    # Type checks.
    int PyUnicode_Check(object ob)
    int PyString_Check(object ob)

    # Unchecked accessors: callers must run the matching *_Check first.
    Py_UNICODE *PyUnicode_AS_UNICODE(object ob)
    int PyUnicode_GET_SIZE(object ob)
    char *PyString_AS_STRING(object ob)

    # Builds a new byte string from `l` bytes starting at `v`.
    object PyString_FromStringAndSize(char *v, int l)

    # Raw memory management for temporary C buffers.
    void *PyMem_Malloc(int size)
    void PyMem_Free(void *p)
104
105
cdef class UCharString:
    """Wrapper for ICU UChar arrays

    Holds the text of a Python string as a ``UChar`` buffer suitable for
    passing to ICU.

    When ``Py_UNICODE`` is two bytes wide the unicode object's own buffer
    is used directly and the object is kept alive through ``base``.
    Otherwise the text is converted with ``u_strFromUTF32`` into a buffer
    owned by this instance and released in ``__dealloc__``.

    Attributes:
      length       -- number of UChar units in ``data``
      base         -- the unicode object whose buffer is borrowed, if any
      need_to_free -- non-zero when ``data`` was allocated by this instance
    """

    cdef UChar *data
    cdef readonly int32_t length
    cdef readonly object base
    cdef readonly int need_to_free

    def __cinit__(self, text):
        """Build the UChar buffer for ``text``.

        ``text`` must be a unicode object, or a byte string that
        ``unicode()`` can decode.  Raises TypeError for any other type,
        MemoryError if the conversion buffer cannot be allocated, and
        ValueError if ICU reports a conversion failure.
        """
        cdef int32_t buffsize
        cdef UErrorCode status
        cdef Py_UNICODE *src
        cdef int length

        if not PyUnicode_Check(text):
            if not PyString_Check(text):
                raise TypeError("Expected unicode string")
            text = unicode(text)
            assert PyUnicode_Check(text)

        length = PyUnicode_GET_SIZE(text)
        src = PyUnicode_AS_UNICODE(text)

        if sizeof(Py_UNICODE) == 2:
            # Same element width as UChar: borrow Python's buffer and pin
            # the owning object so the pointer stays valid.
            self.data = src
            self.length = length
            self.base = text
            self.need_to_free = 0
        else:
            # Room for two UChar units per input character, plus one.
            buffsize = 2*length + 1
            self.data = <UChar*>PyMem_Malloc(buffsize*sizeof(UChar))
            if self.data == NULL:
                raise MemoryError
            # Take ownership immediately so that __dealloc__ releases the
            # buffer on every error path below.
            self.need_to_free = 1
            status = U_ZERO_ERROR
            u_strFromUTF32(self.data, buffsize, &(self.length),
                           <UChar32*>src, length, &status)
            # Check ICU's verdict before trusting the reported length.
            if U_FAILURE(status):
                raise ValueError(
                    "Couldn't convert Python unicode data to ICU unicode data."
                    )
            assert self.length <= buffsize

    def __dealloc__(self):
        # Only free memory this instance allocated; a borrowed buffer
        # belongs to the unicode object referenced by ``base``.
        if self.need_to_free and self.data != NULL:
            PyMem_Free(self.data)
            self.data = NULL
156
157
cdef class Collator:
    """Compute a collation key for a unicode string.

    Wraps an ICU ``UCollator`` opened for a given locale.

    Attributes:
      locale                   -- the locale name passed to the constructor
      used_default_information -- 1 when ICU reported that it fell back to
                                  default or fallback data for the locale;
                                  otherwise the status code from ucol_open
    """

    cdef UCollator *collator
    cdef readonly object locale
    cdef readonly int used_default_information

    def __cinit__(self, locale):
        """Open an ICU collator for ``locale`` (a byte string).

        Raises TypeError if ``locale`` is not a string and ValueError if
        ICU cannot create the collator.
        """
        cdef UCollator *collator
        cdef UErrorCode status

        if not PyString_Check(locale):
            raise TypeError("String locale expected")

        status = U_ZERO_ERROR
        collator = ucol_open(PyString_AS_STRING(locale), &status)
        if U_FAILURE(status):
            raise ValueError("Couldn't create a collator")
        self.collator = collator
        self.locale = locale
        # Both "used default" warnings are folded into the value 1.
        if (status == U_USING_DEFAULT_WARNING
            or status == U_USING_FALLBACK_WARNING):
            status = U_ILLEGAL_ARGUMENT_ERROR
        self.used_default_information = status

    def __dealloc__(self):
        if self.collator != NULL:
            ucol_close(self.collator)

    def key(self, text):
        """Compute a collation key for the given unicode text.

        Of course, the key is only valid for the given locale.

        Returns the key as a byte string.  Raises MemoryError if the
        temporary key buffer cannot be allocated, plus whatever
        ``UCharString(text)`` raises for unsuitable input.
        """
        cdef char *buffer
        cdef int32_t bufsize
        cdef int32_t size
        cdef UCharString icutext

        icutext = UCharString(text)
        # First guess at the key size; enlarged below if ICU wants more.
        bufsize = icutext.length*2+10

        # the +1 below is needed to avoid an apparent buffer overflow bug
        # in ICU
        buffer = <char*>PyMem_Malloc(bufsize + 1)
        if buffer == NULL:
            raise MemoryError
        try:
            size = ucol_getSortKey(self.collator,
                                   icutext.data, icutext.length,
                                   <uint8_t*>buffer, bufsize)
            while size > bufsize:
                # ICU reported the size it needs: retry with that much.
                bufsize = size
                PyMem_Free(buffer)
                buffer = <char*>PyMem_Malloc(bufsize + 1)  # See above +1
                if buffer == NULL:
                    raise MemoryError
                size = ucol_getSortKey(self.collator,
                                       icutext.data, icutext.length,
                                       <uint8_t*>buffer, bufsize)

            return PyString_FromStringAndSize(buffer, size)
        finally:
            # Runs on success and on every error path, so the buffer is
            # not leaked if building the result string fails.
            # PyMem_Free(NULL) is a no-op.
            PyMem_Free(buffer)

    def cmp(self, o1, o2):
        """Compare two strings under this collator's rules.

        Returns the integer result of ICU's ``ucol_strcoll``.
        """
        cdef UCharString u1
        cdef UCharString u2

        u1 = UCharString(o1)
        u2 = UCharString(o2)
        return ucol_strcoll(
            self.collator,
            u1.data, u1.length,
            u2.data, u2.length,
            )
233
cdef class WordBreaker:
    """Split text into segments at ICU word boundaries.

    Wraps an ICU ``UBreakIterator`` of type ``UBRK_WORD`` opened for a
    given locale.

    Attributes:
      locale -- the locale name passed to the constructor
    """

    cdef UBreakIterator *breaker
    cdef readonly object locale

    def __cinit__(self, locale):
        """Open a word break iterator for ``locale`` (a byte string).

        Raises TypeError if ``locale`` is not a string.  Raises ValueError
        if ICU cannot create the iterator, or if ICU had to fall back to
        default data for a locale other than "root" or "C".
        """
        cdef UBreakIterator *breaker
        cdef UErrorCode status
        cdef char *clocale

        # PyString_AS_STRING does no checking of its own, so reject
        # non-strings up front, as Collator does.
        if not PyString_Check(locale):
            raise TypeError("String locale expected")

        status = U_ZERO_ERROR
        clocale = PyString_AS_STRING(locale)
        breaker = ubrk_open(UBRK_WORD, clocale, NULL, 0, &status)
        if U_FAILURE(status):
            raise ValueError("Couldn't create a breaker")
        if ((status == U_USING_DEFAULT_WARNING
             or status == U_USING_FALLBACK_WARNING)
            and locale != "root" and locale != "C"):
            # The iterator was opened successfully but is being rejected.
            # Close it here, because it is never stored on self for
            # __dealloc__ to find.
            ubrk_close(breaker)
            raise ValueError("Invalid locale %s" % locale)
        self.breaker = breaker
        self.locale = locale

    def words(self, string):
        """Return the list of segments of ``string`` between word boundaries.

        Every non-empty span between two consecutive boundaries is
        returned, in order, as a slice of ``string``.

        Raises ValueError if ICU refuses the text, plus whatever
        ``UCharString(string)`` raises for unsuitable input.
        """
        cdef UErrorCode status
        cdef UCharString uni
        cdef int32_t start, end, i
        cdef int32_t py_start, py_end

        status = U_ZERO_ERROR
        uni = UCharString(string)
        # NOTE(review): the iterator presumably keeps pointing at uni's
        # buffer after this method returns; that looks safe only because
        # every use goes through words(), which sets the text again first.
        ubrk_setText(self.breaker, uni.data, uni.length, &status)
        if U_FAILURE(status):
            raise ValueError("Couldn't set text to %s: %d" % (string, status))

        words = []
        # `start`/`end` are offsets into the UChar buffer; `py_start` and
        # `py_end` are the matching indexes into the Python string.
        start = ubrk_first(self.breaker)
        py_start = start
        while start != UBRK_DONE:
            end = ubrk_next(self.breaker)
            if end == UBRK_DONE:
                break
            py_end = py_start + (end - start)
            if uni.need_to_free:
                # The buffer was converted from a wide Python string, so a
                # character outside the BMP takes two UChar units but only
                # one Python index.  Discount each trailing surrogate.
                for i in range(start, end):
                    if uni.data[i] >= 0xDC00 and uni.data[i] <= 0xDFFF:
                        py_end = py_end - 1
            if start != end:
                words.append(string[py_start:py_end])
            start = end
            py_start = py_end
        return words

    def __dealloc__(self):
        if self.breaker != NULL:
            ubrk_close(self.breaker)
276