def cmp(self, string1, string2):
"""Return negative if string1 < string2, zero if equal, positive if greater.

The ordering is that of the two strings' sort keys, as returned by
self.key().
"""
return cmp(self.key(string1), self.key(string2))
+
+ def words(self, string):
+ """Split the string into separate words.
+
+ This split is done with string.split() and no separator argument,
+ i.e. on runs of whitespace (Unicode's definition of whitespace when
+ given a unicode string), so no empty words are returned.
+ """
+ return string.split()
"""
-import collate.icu._ucol
import collate._abcollator
import collate._locale
import collate.errors
+from collate.icu import _icu
+
class Collator(collate._abcollator.Collator):
"""ICU-based collation."""
def __init__(self, locale, encoding=None):
+ """Create the ICU collator and word breaker for a locale.
+
+ locale and encoding are first normalized by
+ collate._locale.getpair(). The "C" locale is handed to ICU as
+ "root". InvalidLocaleError is raised when the ICU collator reports
+ that it used default information for any locale other than "C".
+ """
locale, encoding = collate._locale.getpair(locale, encoding)
- self._collator = collate.icu._ucol.Collator(locale)
+ # ICU is given "root" in place of the "C" locale.
+ icu_locale = "root" if locale == "C" else locale
+ self._collator = _icu.Collator(icu_locale)
self.locale = self._collator.locale
self.encoding = collate._locale.encoding(encoding)
if self._collator.used_default_information and locale != "C":
raise collate.errors.InvalidLocaleError(locale)
+ try:
+ self._breaker = _icu.WordBreaker(icu_locale)
+ except ValueError:
+ # WordBreaker raises ValueError when it cannot open a breaker for
+ # the locale or when ICU fell back to default data for it.
+ # Thai is the only language with a special break locale,
+ # so this is a harmless error: use the root breaker instead.
+ self._breaker = _icu.WordBreaker("root")
+
+ def words(self, string):
+ """Split the string into words using the ICU word breaker.
+
+ Byte strings are first decoded with self.encoding, replacing
+ undecodable bytes. Segments that consist only of whitespace are
+ removed; every other segment the breaker returns is kept.
+ """
+ if isinstance(string, str):
+ string = string.decode(self.encoding, 'replace')
+ return filter(lambda u: not u.isspace(), self._breaker.words(string))
+
def key(self, string):
"""Sort key for a string.
--- /dev/null
+##############################################################################
+#
+# Copyright (c) 2004 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Simple wrapper for the ICU ucol (collation) and ubrk (break iteration) APIs
+
+"""
+
+import sys
+
+cdef extern from "unicode/utypes.h":
+
+ cdef enum UErrorCode:
+ U_USING_DEFAULT_WARNING = -127
+ U_USING_FALLBACK_WARNING = -128
+ U_ZERO_ERROR = 0
+ U_ILLEGAL_ARGUMENT_ERROR = 1
+ ctypedef int int32_t
+ ctypedef char uint8_t
+ int U_FAILURE(UErrorCode status)
+
+cdef extern from "unicode/utf.h":
+
+ ctypedef int UChar
+ ctypedef int UChar32
+
+cdef extern from "unicode/ustring.h":
+
+ UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity,
+ int32_t *pDestLength,
+ UChar32 *src, int32_t srcLength,
+ UErrorCode *status)
+
+cdef extern from "unicode/ucol.h":
+
+ ctypedef struct UCollator:
+ pass
+ UCollator *ucol_open(char *locale, UErrorCode *status)
+ void ucol_close(UCollator *collator)
+ int32_t ucol_getSortKey(UCollator *coll,
+ UChar *source, int32_t sourceLength,
+ uint8_t *result,
+ int32_t resultLength
+ )
+ int ucol_strcoll(UCollator *coll,
+ UChar *source, int32_t sourceLength,
+ UChar *target, int32_t targetLength)
+
+cdef extern from "unicode/ubrk.h":
+ cdef enum UBreakIteratorType:
+ UBRK_CHARACTER = 0
+ UBRK_WORD = 1
+ UBRK_LINE = 2
+ UBRK_SENTENCE = 3
+ # UBRK_TITLE = 4 # Deprecated
+ UBRK_COUNT = 5
+
+ DEF UBRK_DONE = ((int)(-1))
+
+ ctypedef struct UBreakIterator:
+ pass
+
+ UBreakIterator *ubrk_open(UBreakIteratorType type,
+ char *locale,
+ UChar *text,
+ int32_t textLength,
+ UErrorCode *status)
+ void ubrk_close(UBreakIterator *bi)
+ void ubrk_setText(UBreakIterator *bi,
+ UChar *text,
+ int32_t textLength,
+ UErrorCode *status)
+ int32_t ubrk_current(UBreakIterator *bi)
+ int32_t ubrk_next(UBreakIterator *bi)
+ int32_t ubrk_previous(UBreakIterator *bi)
+ int32_t ubrk_first(UBreakIterator *bi)
+ int32_t ubrk_last(UBreakIterator *bi)
+ int32_t ubrk_preceding(UBreakIterator *bi, int32_t offset)
+ int32_t ubrk_following(UBreakIterator *bi, int32_t offset)
+
+
+cdef extern from "Python.h":
+
+ int PyUnicode_Check(ob)
+ int PyString_Check(ob)
+
+ ctypedef int Py_UNICODE
+ Py_UNICODE *PyUnicode_AS_UNICODE(ob)
+ int PyUnicode_GET_SIZE(ob)
+ char *PyString_AS_STRING(ob)
+
+ void *PyMem_Malloc(int size)
+ void PyMem_Free(void *p)
+ object PyString_FromStringAndSize(char *v, int l)
+
+
+cdef class UCharString:
+ """Wrapper for ICU UChar arrays
+
+ Built from a Python unicode object (or a byte string, which is first
+ converted with unicode()). Exposes the text as a UChar buffer (data)
+ together with its length in UChars.
+ """
+
+ # Pointer to the UChar buffer: either Python's own unicode buffer or a
+ # buffer allocated with PyMem_Malloc (see need_to_free).
+ cdef UChar *data
+ # Number of UChars in data.
+ cdef readonly int32_t length
+ # The unicode object whose buffer is borrowed, when no copy was made;
+ # presumably held so that the borrowed buffer stays alive.
+ cdef readonly object base
+ # Nonzero when data was allocated here and must be released in
+ # __dealloc__.
+ cdef readonly int need_to_free
+
+ def __cinit__(self, text):
+ cdef int32_t buffsize
+ cdef UErrorCode status
+ cdef Py_UNICODE *str
+ cdef int length
+
+ # Accept byte strings by converting them with unicode() (no explicit
+ # encoding is given); anything else that is not unicode is rejected.
+ if not PyUnicode_Check(text):
+ if PyString_Check(text):
+ text = unicode(text)
+ assert PyUnicode_Check(text)
+ else:
+ raise TypeError("Expected unicode string")
+
+ length = PyUnicode_GET_SIZE(text)
+ str = PyUnicode_AS_UNICODE(text)
+
+
+ if sizeof(Py_UNICODE) == 2:
+ # Python's unicode units are already 2 bytes wide: use its buffer
+ # directly instead of copying.
+ self.data = str
+ self.length = length
+ self.base = text
+ self.need_to_free = 0
+ else:
+ # Otherwise convert from UTF-32. Room is reserved for two UChars
+ # per input code point plus one extra UChar.
+ buffsize = 2*length + 1
+ self.data = <UChar*>PyMem_Malloc(buffsize*sizeof(UChar))
+ if self.data == NULL:
+ raise MemoryError
+ status = U_ZERO_ERROR
+ u_strFromUTF32(self.data, buffsize, &(self.length),
+ <UChar32*>str, length, &status)
+ # NOTE(review): need_to_free is only set after this assert, so a
+ # failing assert would leave the buffer unreleased -- confirm
+ # whether that matters.
+ assert self.length <= buffsize
+ self.need_to_free = 1
+ if U_FAILURE(status):
+ raise ValueError(
+ "Couldn't convert Python unicode data to ICU unicode data."
+ )
+
+ def __dealloc__(self):
+ # Only free buffers allocated in __cinit__; a borrowed buffer belongs
+ # to the unicode object.
+ if self.need_to_free and self.data != NULL:
+ PyMem_Free(self.data)
+ self.data = NULL
+
+
+cdef class Collator:
+ """Compute collation keys and compare unicode strings for one locale.
+
+ Wraps an ICU UCollator opened with ucol_open() for the given locale.
+ """
+
+ # The underlying ICU collator; closed in __dealloc__.
+ cdef UCollator *collator
+ # The locale string this collator was created with.
+ cdef readonly object locale
+ # Nonzero when ucol_open() reported that default or fallback data was
+ # used for the requested locale.
+ cdef readonly int used_default_information
+
+ def __cinit__(self, locale):
+ cdef UCollator *collator
+ cdef UErrorCode status
+
+ if not PyString_Check(locale):
+ raise TypeError("String locale expected")
+
+ status = U_ZERO_ERROR
+ collator = ucol_open(PyString_AS_STRING(locale), &status)
+ if U_FAILURE(status):
+ raise ValueError("Couldn't create a collator")
+ self.collator = collator
+ self.locale = locale
+ # The two warning codes are negative; map them to a positive value
+ # before storing the flag.
+ if (status == U_USING_DEFAULT_WARNING
+ or status == U_USING_FALLBACK_WARNING):
+ status = U_ILLEGAL_ARGUMENT_ERROR
+ self.used_default_information = status
+
+ def __dealloc__(self):
+ if self.collator != NULL:
+ ucol_close(self.collator)
+
+ def key(self, text):
+ """Compute a collation key for the given unicode text.
+
+ The key is returned as a byte string. Of course, the key is only
+ valid for the given locale.
+ """
+ cdef char *buffer
+ cdef int32_t bufsize
+ cdef int32_t size
+
+ icutext = UCharString(text)
+ # Initial guess for the key size; grown below if it is too small.
+ bufsize = (<UCharString>icutext).length*2+10
+
+ # the +1 below is needed to avoid an apparent buffer overflow bug in ICU
+ buffer = <char*>PyMem_Malloc(bufsize +1)
+ if buffer == NULL:
+ raise MemoryError
+ size = ucol_getSortKey(self.collator,
+ (<UCharString>icutext).data,
+ (<UCharString>icutext).length,
+ <uint8_t*>buffer, bufsize)
+ # A returned size larger than the buffer means the key did not fit:
+ # retry with a buffer of the reported size.
+ while size > bufsize:
+ bufsize = size
+ PyMem_Free(buffer)
+ buffer = <char*>PyMem_Malloc(bufsize +1) # See above +1
+ if buffer == NULL:
+ raise MemoryError
+ size = ucol_getSortKey(self.collator,
+ (<UCharString>icutext).data,
+ (<UCharString>icutext).length,
+ <uint8_t*>buffer, bufsize)
+
+ # Copy the key into a Python string before releasing the C buffer.
+ result = PyString_FromStringAndSize(buffer, size)
+ PyMem_Free(buffer)
+ return result
+
+ def cmp(self, o1, o2):
+ """Compare two strings with ucol_strcoll() and return its result."""
+ u1 = UCharString(o1)
+ u2 = UCharString(o2)
+ return ucol_strcoll(
+ self.collator,
+ (<UCharString>u1).data,
+ (<UCharString>u1).length,
+ (<UCharString>u2).data,
+ (<UCharString>u2).length,
+ )
+
+cdef class WordBreaker:
+    """Split text into segments at ICU word boundaries for one locale.
+
+    Wraps an ICU UBreakIterator opened with ubrk_open(UBRK_WORD, ...). The
+    iterator is created once and re-pointed at each new text in words().
+    """
+
+    # The underlying ICU break iterator; closed in __dealloc__.
+    cdef UBreakIterator *breaker
+    # The locale string this breaker was created with.
+    cdef readonly object locale
+
+    def __cinit__(self, locale):
+        """Open a word break iterator for locale (a byte string).
+
+        Raises TypeError if locale is not a byte string, and ValueError if
+        the iterator cannot be created or if ICU fell back to default data
+        for a locale other than "root" or "C".
+        """
+        cdef UBreakIterator *breaker
+        cdef UErrorCode status
+        cdef char *clocale
+
+        # PyString_AS_STRING does no type checking, so reject anything that
+        # is not a byte string first (the same check Collator performs).
+        if not PyString_Check(locale):
+            raise TypeError("String locale expected")
+
+        status = U_ZERO_ERROR
+        clocale = PyString_AS_STRING(locale)
+        breaker = ubrk_open(UBRK_WORD, clocale, NULL, 0, &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't create a breaker")
+        if ((status == U_USING_DEFAULT_WARNING
+             or status == U_USING_FALLBACK_WARNING)
+            and locale != "root" and locale != "C"):
+            # The iterator was opened successfully but is not stored on
+            # self yet, so __dealloc__ would never close it: release it
+            # here before reporting the error.
+            ubrk_close(breaker)
+            raise ValueError("Invalid locale %s" % locale)
+        self.breaker = breaker
+        self.locale = locale
+
+    def words(self, string):
+        """Return the list of segments between consecutive word boundaries.
+
+        Every non-empty span between two adjacent boundaries is returned
+        as a slice of string; nothing is filtered out here.
+
+        Raises ValueError if ICU rejects the text.
+        """
+        cdef UErrorCode status
+        cdef UCharString uni
+        status = U_ZERO_ERROR
+        # uni must stay referenced while iterating: the break iterator
+        # reads from its buffer.
+        uni = UCharString(string)
+        ubrk_setText(self.breaker,
+                     (<UCharString>uni).data,
+                     (<UCharString>uni).length, &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't set text to %s: %d" % (string, status))
+        p = ubrk_first(self.breaker)
+        words = []
+        while p != UBRK_DONE:
+            n = ubrk_next(self.breaker)
+            if p != n and n != UBRK_DONE:
+                # NOTE(review): p and n are offsets into the UChar buffer
+                # but are used to slice the Python string; these presumably
+                # only agree when one Python character maps to one UChar --
+                # confirm for builds where sizeof(Py_UNICODE) != 2.
+                words.append(string[p:n])
+            p = n
+        return words
+
+    def __dealloc__(self):
+        if self.breaker != NULL:
+            ubrk_close(self.breaker)
+
+++ /dev/null
-##############################################################################
-#
-# Copyright (c) 2004 Zope Corporation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Simple wrapper for ICU ucol API
-
-"""
-
-import sys
-
-cdef extern from "unicode/utypes.h":
-
- cdef enum UErrorCode:
- U_USING_DEFAULT_WARNING = -127
- U_USING_FALLBACK_WARNING = -128
- U_ZERO_ERROR = 0
- U_ILLEGAL_ARGUMENT_ERROR = 1
- ctypedef int int32_t
- ctypedef char uint8_t
- int U_FAILURE(UErrorCode status)
-
-cdef extern from "unicode/utf.h":
-
- ctypedef int UChar
- ctypedef int UChar32
-
-cdef extern from "unicode/ustring.h":
-
- UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity,
- int32_t *pDestLength,
- UChar32 *src, int32_t srcLength,
- UErrorCode *status)
-
-cdef extern from "unicode/ucol.h":
-
- ctypedef struct UCollator:
- pass
- UCollator *ucol_open(char *locale, UErrorCode *status)
- void ucol_close(UCollator *collator)
- int32_t ucol_getSortKey(UCollator *coll,
- UChar *source, int32_t sourceLength,
- uint8_t *result,
- int32_t resultLength
- )
- int ucol_strcoll(UCollator *coll,
- UChar *source, int32_t sourceLength,
- UChar *target, int32_t targetLength)
-
-cdef extern from "Python.h":
-
- int PyUnicode_Check(ob)
- int PyString_Check(ob)
-
- ctypedef int Py_UNICODE
- Py_UNICODE *PyUnicode_AS_UNICODE(ob)
- int PyUnicode_GET_SIZE(ob)
- char *PyString_AS_STRING(ob)
-
- void *PyMem_Malloc(int size)
- void PyMem_Free(void *p)
- object PyString_FromStringAndSize(char *v, int l)
-
-
-cdef class UCharString:
- """Wrapper for ICU UChar arrays
- """
-
- cdef UChar *data
- cdef readonly int32_t length
- cdef readonly object base
- cdef readonly int need_to_free
-
- def __cinit__(self, text):
- cdef int32_t buffsize
- cdef UErrorCode status
- cdef Py_UNICODE *str
- cdef int length
-
- if not PyUnicode_Check(text):
- if PyString_Check(text):
- text = unicode(text)
- assert PyUnicode_Check(text)
- else:
- raise TypeError("Expected unicode string")
-
- length = PyUnicode_GET_SIZE(text)
- str = PyUnicode_AS_UNICODE(text)
-
-
- if sizeof(Py_UNICODE) == 2:
- self.data = str
- self.length = length
- self.base = text
- self.need_to_free = 0
- else:
- buffsize = 2*length + 1
- self.data = <UChar*>PyMem_Malloc(buffsize*sizeof(UChar))
- if self.data == NULL:
- raise MemoryError
- status = U_ZERO_ERROR
- u_strFromUTF32(self.data, buffsize, &(self.length),
- <UChar32*>str, length, &status)
- assert self.length <= buffsize
- self.need_to_free = 1
- if U_FAILURE(status):
- raise ValueError(
- "Couldn't convert Python unicode data to ICU unicode data."
- )
-
- def __dealloc__(self):
- if self.need_to_free and self.data != NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
-
-cdef class Collator:
- """Compute a collation key for a unicode string.
- """
-
- cdef UCollator *collator
- cdef readonly object locale
- cdef readonly int used_default_information
-
- def __cinit__(self, locale):
- cdef UCollator *collator
- cdef UErrorCode status
-
- if not PyString_Check(locale):
- raise TypeError("String locale expected")
-
- status = U_ZERO_ERROR
- collator = ucol_open(PyString_AS_STRING(locale), &status)
- if U_FAILURE(status):
- raise ValueError("Couldn't create a collator")
- self.collator = collator
- self.locale = locale
- if (status == U_USING_DEFAULT_WARNING
- or status == U_USING_FALLBACK_WARNING):
- status = U_ILLEGAL_ARGUMENT_ERROR
- self.used_default_information = status
-
- def __dealloc__(self):
- if self.collator != NULL:
- ucol_close(self.collator)
-
- def key(self, text):
- """Compute a collation key for the given unicode text.
-
- Of course, the key is only valid for the given locale.
- """
- cdef char *buffer
- cdef int32_t bufsize
- cdef int32_t size
-
- icutext = UCharString(text)
- bufsize = (<UCharString>icutext).length*2+10
-
- # the +1 below is needed to avoid an apprent buffer overflow bug in ICU
- buffer = <char*>PyMem_Malloc(bufsize +1)
- if buffer == NULL:
- raise MemoryError
- size = ucol_getSortKey(self.collator,
- (<UCharString>icutext).data,
- (<UCharString>icutext).length,
- <uint8_t*>buffer, bufsize)
- while size > bufsize:
- bufsize = size
- PyMem_Free(buffer)
- buffer = <char*>PyMem_Malloc(bufsize +1) # See above +1
- if buffer == NULL:
- raise MemoryError
- size = ucol_getSortKey(self.collator,
- (<UCharString>icutext).data,
- (<UCharString>icutext).length,
- <uint8_t*>buffer, bufsize)
-
- result = PyString_FromStringAndSize(buffer, size)
- PyMem_Free(buffer)
- return result
-
- def cmp(self, o1, o2):
- u1 = UCharString(o1)
- u2 = UCharString(o2)
- return ucol_strcoll(
- self.collator,
- (<UCharString>u1).data,
- (<UCharString>u1).length,
- (<UCharString>u2).data,
- (<UCharString>u2).length,
- )
locale information.
Use this collation backend if...
- - You are on a system without ICU or UCA datafiles for the locale,
- and DUCET results are not acceptable.
+ - You are on a system without ICU.
Avoid this backend if...
- - ICU or UCA support is available for the current locale.
+ - ICU is available for the current locale.
- You are sorting strings from alphabets outside the primary locale.
- You need to support collating multiple locales at once.
- You need the same results across multiple platforms.
"""
import locale
+import re
import collate.errors
import collate._abcollator
b = b.decode(self.encoding, "replace")
return locale.strcoll(a, b)
+ def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
+ """Split the string into separate words.
+
+ This split is done using the locale's notion of a word boundry.
+ """
+ return re.split(sep, string)
+
description="Python text collation",
license="MIT / ZPL 2.1",
ext_modules=[
- Extension('collate.icu._ucol',
- ['collate/icu/_ucol.pyx'],
+ Extension('collate.icu._icu',
+ ['collate/icu/_icu.pyx'],
libraries=libraries)],
cmdclass=dict(build_ext=build_ext),
- packages=["collate", "collate.icu", "collate.uca", "collate.syslocale"],
+ packages=["collate", "collate.icu"],
)