From: Joe Wreschnig Date: Mon, 15 Feb 2010 09:20:21 +0000 (-0800) Subject: Word-splitting. X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=commitdiff_plain;h=f73c4c6cd3ed326c5735ab33a6896697227d07e3 Word-splitting. --- diff --git a/collate/_abcollator.py b/collate/_abcollator.py index 71f5f54..99866c3 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -2,3 +2,10 @@ class Collator(object): def cmp(self, string1, string2): """Return negative if a < b, zero if a == b, positive if a > b.""" return cmp(self.key(string1), self.key(string2)) + + def words(self, string): + """Split the string into separate words. + + This split is done using Unicode's definition of whitespace. + """ + return string.split() diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py index eeee418..00a1538 100644 --- a/collate/icu/__init__.py +++ b/collate/icu/__init__.py @@ -11,22 +11,36 @@ Avoid this backend if... """ -import collate.icu._ucol import collate._abcollator import collate._locale import collate.errors +from collate.icu import _icu + class Collator(collate._abcollator.Collator): """ICU-based collation.""" def __init__(self, locale, encoding=None): locale, encoding = collate._locale.getpair(locale, encoding) - self._collator = collate.icu._ucol.Collator(locale) + icu_locale = "root" if locale == "C" else locale + self._collator = _icu.Collator(icu_locale) self.locale = self._collator.locale self.encoding = collate._locale.encoding(encoding) if self._collator.used_default_information and locale != "C": raise collate.errors.InvalidLocaleError(locale) + try: + self._breaker = _icu.WordBreaker(icu_locale) + except ValueError: + # Thai is the only language with a special break locale, + # so this is a harmless error. 
+ self._breaker = _icu.WordBreaker("root") + + def words(self, string): + if isinstance(string, str): + string = string.decode(self.encoding, 'replace') + return filter(lambda u: not u.isspace(), self._breaker.words(string)) + def key(self, string): """Sort key for a string. diff --git a/collate/icu/_icu.pyx b/collate/icu/_icu.pyx new file mode 100644 index 0000000..c3b1484 --- /dev/null +++ b/collate/icu/_icu.pyx @@ -0,0 +1,276 @@ +############################################################################## +# +# Copyright (c) 2004 Zope Corporation and Contributors. +# All Rights Reserved. +# +# This software is subject to the provisions of the Zope Public License, +# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. +# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED +# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS +# FOR A PARTICULAR PURPOSE. 
+# +############################################################################## +"""Simple wrapper for ICU ucol API + +""" + +import sys + +cdef extern from "unicode/utypes.h": + + cdef enum UErrorCode: + U_USING_DEFAULT_WARNING = -127 + U_USING_FALLBACK_WARNING = -128 + U_ZERO_ERROR = 0 + U_ILLEGAL_ARGUMENT_ERROR = 1 + ctypedef int int32_t + ctypedef char uint8_t + int U_FAILURE(UErrorCode status) + +cdef extern from "unicode/utf.h": + + ctypedef int UChar + ctypedef int UChar32 + +cdef extern from "unicode/ustring.h": + + UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity, + int32_t *pDestLength, + UChar32 *src, int32_t srcLength, + UErrorCode *status) + +cdef extern from "unicode/ucol.h": + + ctypedef struct UCollator: + pass + UCollator *ucol_open(char *locale, UErrorCode *status) + void ucol_close(UCollator *collator) + int32_t ucol_getSortKey(UCollator *coll, + UChar *source, int32_t sourceLength, + uint8_t *result, + int32_t resultLength + ) + int ucol_strcoll(UCollator *coll, + UChar *source, int32_t sourceLength, + UChar *target, int32_t targetLength) + +cdef extern from "unicode/ubrk.h": + cdef enum UBreakIteratorType: + UBRK_CHARACTER = 0 + UBRK_WORD = 1 + UBRK_LINE = 2 + UBRK_SENTENCE = 3 + # UBRK_TITLE = 4 # Deprecated + UBRK_COUNT = 5 + + DEF UBRK_DONE = ((int)(-1)) + + ctypedef struct UBreakIterator: + pass + + UBreakIterator *ubrk_open(UBreakIteratorType type, + char *locale, + UChar *text, + int32_t textLength, + UErrorCode *status) + void ubrk_close(UBreakIterator *bi) + void ubrk_setText(UBreakIterator *bi, + UChar *text, + int32_t textLength, + UErrorCode *status) + int32_t ubrk_current(UBreakIterator *bi) + int32_t ubrk_next(UBreakIterator *bi) + int32_t ubrk_previous(UBreakIterator *bi) + int32_t ubrk_first(UBreakIterator *bi) + int32_t ubrk_last(UBreakIterator *bi) + int32_t ubrk_preceding(UBreakIterator *bi, int32_t offset) + int32_t ubrk_following(UBreakIterator *bi, int32_t offset) + + +cdef extern from "Python.h": + + int 
PyUnicode_Check(ob) + int PyString_Check(ob) + + ctypedef int Py_UNICODE + Py_UNICODE *PyUnicode_AS_UNICODE(ob) + int PyUnicode_GET_SIZE(ob) + char *PyString_AS_STRING(ob) + + void *PyMem_Malloc(int size) + void PyMem_Free(void *p) + object PyString_FromStringAndSize(char *v, int l) + + +cdef class UCharString: + """Wrapper for ICU UChar arrays + """ + + cdef UChar *data + cdef readonly int32_t length + cdef readonly object base + cdef readonly int need_to_free + + def __cinit__(self, text): + cdef int32_t buffsize + cdef UErrorCode status + cdef Py_UNICODE *str + cdef int length + + if not PyUnicode_Check(text): + if PyString_Check(text): + text = unicode(text) + assert PyUnicode_Check(text) + else: + raise TypeError("Expected unicode string") + + length = PyUnicode_GET_SIZE(text) + str = PyUnicode_AS_UNICODE(text) + + + if sizeof(Py_UNICODE) == 2: + self.data = str + self.length = length + self.base = text + self.need_to_free = 0 + else: + buffsize = 2*length + 1 + self.data = PyMem_Malloc(buffsize*sizeof(UChar)) + if self.data == NULL: + raise MemoryError + status = U_ZERO_ERROR + u_strFromUTF32(self.data, buffsize, &(self.length), + str, length, &status) + assert self.length <= buffsize + self.need_to_free = 1 + if U_FAILURE(status): + raise ValueError( + "Couldn't convert Python unicode data to ICU unicode data." + ) + + def __dealloc__(self): + if self.need_to_free and self.data != NULL: + PyMem_Free(self.data) + self.data = NULL + + +cdef class Collator: + """Compute a collation key for a unicode string. 
+ """ + + cdef UCollator *collator + cdef readonly object locale + cdef readonly int used_default_information + + def __cinit__(self, locale): + cdef UCollator *collator + cdef UErrorCode status + + if not PyString_Check(locale): + raise TypeError("String locale expected") + + status = U_ZERO_ERROR + collator = ucol_open(PyString_AS_STRING(locale), &status) + if U_FAILURE(status): + raise ValueError("Couldn't create a collator") + self.collator = collator + self.locale = locale + if (status == U_USING_DEFAULT_WARNING + or status == U_USING_FALLBACK_WARNING): + status = U_ILLEGAL_ARGUMENT_ERROR + self.used_default_information = status + + def __dealloc__(self): + if self.collator != NULL: + ucol_close(self.collator) + + def key(self, text): + """Compute a collation key for the given unicode text. + + Of course, the key is only valid for the given locale. + """ + cdef char *buffer + cdef int32_t bufsize + cdef int32_t size + + icutext = UCharString(text) + bufsize = (icutext).length*2+10 + + # the +1 below is needed to avoid an apparent buffer overflow bug in ICU + buffer = PyMem_Malloc(bufsize +1) + if buffer == NULL: + raise MemoryError + size = ucol_getSortKey(self.collator, + (icutext).data, + (icutext).length, + buffer, bufsize) + while size > bufsize: + bufsize = size + PyMem_Free(buffer) + buffer = PyMem_Malloc(bufsize +1) # See above +1 + if buffer == NULL: + raise MemoryError + size = ucol_getSortKey(self.collator, + (icutext).data, + (icutext).length, + buffer, bufsize) + + result = PyString_FromStringAndSize(buffer, size) + PyMem_Free(buffer) + return result + + def cmp(self, o1, o2): + u1 = UCharString(o1) + u2 = UCharString(o2) + return ucol_strcoll( + self.collator, + (u1).data, + (u1).length, + (u2).data, + (u2).length, + ) + +cdef class WordBreaker: + cdef UBreakIterator *breaker + cdef readonly object locale + + def __cinit__(self, locale): + cdef UBreakIterator *breaker + cdef UErrorCode status + cdef char *clocale + status = U_ZERO_ERROR + clocale = 
PyString_AS_STRING(locale) + breaker = ubrk_open(UBRK_WORD, clocale, NULL, 0, &status) + if U_FAILURE(status): + raise ValueError("Couldn't create a breaker") + if ((status == U_USING_DEFAULT_WARNING + or status == U_USING_FALLBACK_WARNING) + and locale != "root" and locale != "C"): + raise ValueError("Invalid locale %s" % locale) + self.breaker = breaker + self.locale = locale + + def words(self, string): + cdef UErrorCode status + cdef UCharString uni + status = U_ZERO_ERROR + uni = UCharString(string) + ubrk_setText(self.breaker, + (uni).data, + (uni).length, &status) + if U_FAILURE(status): + raise ValueError("Couldn't set text to %s: %d" % (string, status)) + p = ubrk_first(self.breaker) + words = [] + while p != UBRK_DONE: + n = ubrk_next(self.breaker) + if p != n and n != UBRK_DONE: + words.append(string[p:n]) + p = n + return words + + def __dealloc__(self): + if self.breaker != NULL: + ubrk_close(self.breaker) + diff --git a/collate/icu/_ucol.pyx b/collate/icu/_ucol.pyx deleted file mode 100644 index e54dd53..0000000 --- a/collate/icu/_ucol.pyx +++ /dev/null @@ -1,199 +0,0 @@ -############################################################################## -# -# Copyright (c) 2004 Zope Corporation and Contributors. -# All Rights Reserved. -# -# This software is subject to the provisions of the Zope Public License, -# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. -# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED -# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS -# FOR A PARTICULAR PURPOSE. 
-# -############################################################################## -"""Simple wrapper for ICU ucol API - -""" - -import sys - -cdef extern from "unicode/utypes.h": - - cdef enum UErrorCode: - U_USING_DEFAULT_WARNING = -127 - U_USING_FALLBACK_WARNING = -128 - U_ZERO_ERROR = 0 - U_ILLEGAL_ARGUMENT_ERROR = 1 - ctypedef int int32_t - ctypedef char uint8_t - int U_FAILURE(UErrorCode status) - -cdef extern from "unicode/utf.h": - - ctypedef int UChar - ctypedef int UChar32 - -cdef extern from "unicode/ustring.h": - - UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity, - int32_t *pDestLength, - UChar32 *src, int32_t srcLength, - UErrorCode *status) - -cdef extern from "unicode/ucol.h": - - ctypedef struct UCollator: - pass - UCollator *ucol_open(char *locale, UErrorCode *status) - void ucol_close(UCollator *collator) - int32_t ucol_getSortKey(UCollator *coll, - UChar *source, int32_t sourceLength, - uint8_t *result, - int32_t resultLength - ) - int ucol_strcoll(UCollator *coll, - UChar *source, int32_t sourceLength, - UChar *target, int32_t targetLength) - -cdef extern from "Python.h": - - int PyUnicode_Check(ob) - int PyString_Check(ob) - - ctypedef int Py_UNICODE - Py_UNICODE *PyUnicode_AS_UNICODE(ob) - int PyUnicode_GET_SIZE(ob) - char *PyString_AS_STRING(ob) - - void *PyMem_Malloc(int size) - void PyMem_Free(void *p) - object PyString_FromStringAndSize(char *v, int l) - - -cdef class UCharString: - """Wrapper for ICU UChar arrays - """ - - cdef UChar *data - cdef readonly int32_t length - cdef readonly object base - cdef readonly int need_to_free - - def __cinit__(self, text): - cdef int32_t buffsize - cdef UErrorCode status - cdef Py_UNICODE *str - cdef int length - - if not PyUnicode_Check(text): - if PyString_Check(text): - text = unicode(text) - assert PyUnicode_Check(text) - else: - raise TypeError("Expected unicode string") - - length = PyUnicode_GET_SIZE(text) - str = PyUnicode_AS_UNICODE(text) - - - if sizeof(Py_UNICODE) == 2: - self.data 
= str - self.length = length - self.base = text - self.need_to_free = 0 - else: - buffsize = 2*length + 1 - self.data = PyMem_Malloc(buffsize*sizeof(UChar)) - if self.data == NULL: - raise MemoryError - status = U_ZERO_ERROR - u_strFromUTF32(self.data, buffsize, &(self.length), - str, length, &status) - assert self.length <= buffsize - self.need_to_free = 1 - if U_FAILURE(status): - raise ValueError( - "Couldn't convert Python unicode data to ICU unicode data." - ) - - def __dealloc__(self): - if self.need_to_free and self.data != NULL: - PyMem_Free(self.data) - self.data = NULL - - -cdef class Collator: - """Compute a collation key for a unicode string. - """ - - cdef UCollator *collator - cdef readonly object locale - cdef readonly int used_default_information - - def __cinit__(self, locale): - cdef UCollator *collator - cdef UErrorCode status - - if not PyString_Check(locale): - raise TypeError("String locale expected") - - status = U_ZERO_ERROR - collator = ucol_open(PyString_AS_STRING(locale), &status) - if U_FAILURE(status): - raise ValueError("Couldn't create a collator") - self.collator = collator - self.locale = locale - if (status == U_USING_DEFAULT_WARNING - or status == U_USING_FALLBACK_WARNING): - status = U_ILLEGAL_ARGUMENT_ERROR - self.used_default_information = status - - def __dealloc__(self): - if self.collator != NULL: - ucol_close(self.collator) - - def key(self, text): - """Compute a collation key for the given unicode text. - - Of course, the key is only valid for the given locale. 
- """ - cdef char *buffer - cdef int32_t bufsize - cdef int32_t size - - icutext = UCharString(text) - bufsize = (icutext).length*2+10 - - # the +1 below is needed to avoid an apprent buffer overflow bug in ICU - buffer = PyMem_Malloc(bufsize +1) - if buffer == NULL: - raise MemoryError - size = ucol_getSortKey(self.collator, - (icutext).data, - (icutext).length, - buffer, bufsize) - while size > bufsize: - bufsize = size - PyMem_Free(buffer) - buffer = PyMem_Malloc(bufsize +1) # See above +1 - if buffer == NULL: - raise MemoryError - size = ucol_getSortKey(self.collator, - (icutext).data, - (icutext).length, - buffer, bufsize) - - result = PyString_FromStringAndSize(buffer, size) - PyMem_Free(buffer) - return result - - def cmp(self, o1, o2): - u1 = UCharString(o1) - u2 = UCharString(o2) - return ucol_strcoll( - self.collator, - (u1).data, - (u1).length, - (u2).data, - (u2).length, - ) diff --git a/collate/syslocale.py b/collate/syslocale.py index 1ee2924..e2aeed9 100644 --- a/collate/syslocale.py +++ b/collate/syslocale.py @@ -12,11 +12,10 @@ locale of all previous collators and anything else using the system locale information. Use this collation backend if... - - You are on a system without ICU or UCA datafiles for the locale, - and DUCET results are not acceptable. + - You are on a system without ICU. Avoid this backend if... - - ICU or UCA support is available for the current locale. + - ICU is available for the current locale. - You are sorting strings from alphabets outside the primary locale. - You need to support collating multiple locales at once. - You need the same results across multiple platforms. @@ -24,6 +23,7 @@ Avoid this backend if... """ import locale +import re import collate.errors import collate._abcollator @@ -66,3 +66,10 @@ class Collator(collate._abcollator.Collator): b = b.decode(self.encoding, "replace") return locale.strcoll(a, b) + def words(self, string, sep=re.compile(r"\W+", re.UNICODE)): + """Split the string into separate words. 
+ + This split is done using the locale's notion of a word boundary. + """ + return re.split(sep, string) + diff --git a/setup.py b/setup.py index cf14037..111cfac 100755 --- a/setup.py +++ b/setup.py @@ -17,9 +17,9 @@ setup(name='collate', description="Python text collation", license="MIT / ZPL 2.1", ext_modules=[ - Extension('collate.icu._ucol', - ['collate/icu/_ucol.pyx'], + Extension('collate.icu._icu', + ['collate/icu/_icu.pyx'], libraries=libraries)], cmdclass=dict(build_ext=build_ext), - packages=["collate", "collate.icu", "collate.uca", "collate.syslocale"], + packages=["collate", "collate.icu"], )