From f73c4c6cd3ed326c5735ab33a6896697227d07e3 Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Mon, 15 Feb 2010 01:20:21 -0800 Subject: [PATCH 1/1] Word-splitting. --- collate/_abcollator.py | 7 +++ collate/icu/__init__.py | 18 +++++- collate/icu/{_ucol.pyx => _icu.pyx} | 87 +++++++++++++++++++++++++++-- collate/syslocale.py | 13 ++++- setup.py | 6 +- 5 files changed, 118 insertions(+), 13 deletions(-) rename collate/icu/{_ucol.pyx => _icu.pyx} (69%) diff --git a/collate/_abcollator.py b/collate/_abcollator.py index 71f5f54..99866c3 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -2,3 +2,10 @@ class Collator(object): def cmp(self, string1, string2): """Return negative if a < b, zero if a == b, positive if a > b.""" return cmp(self.key(string1), self.key(string2)) + + def words(self, string): + """Split the string into separate words. + + This split is done using Unicode's definition of whitespace. + """ + return string.split() diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py index eeee418..00a1538 100644 --- a/collate/icu/__init__.py +++ b/collate/icu/__init__.py @@ -11,22 +11,36 @@ Avoid this backend if... """ -import collate.icu._ucol import collate._abcollator import collate._locale import collate.errors +from collate.icu import _icu + class Collator(collate._abcollator.Collator): """ICU-based collation.""" def __init__(self, locale, encoding=None): locale, encoding = collate._locale.getpair(locale, encoding) - self._collator = collate.icu._ucol.Collator(locale) + icu_locale = "root" if locale == "C" else locale + self._collator = _icu.Collator(icu_locale) self.locale = self._collator.locale self.encoding = collate._locale.encoding(encoding) if self._collator.used_default_information and locale != "C": raise collate.errors.InvalidLocaleError(locale) + try: + self._breaker = _icu.WordBreaker(icu_locale) + except ValueError: + # Thai is the only language with a special break locale, + # so this is a harmless error. 
+ self._breaker = _icu.WordBreaker("root") + + def words(self, string): + if isinstance(string, str): + string = string.decode(self.encoding, 'replace') + return filter(lambda u: not u.isspace(), self._breaker.words(string)) + def key(self, string): """Sort key for a string. diff --git a/collate/icu/_ucol.pyx b/collate/icu/_icu.pyx similarity index 69% rename from collate/icu/_ucol.pyx rename to collate/icu/_icu.pyx index e54dd53..c3b1484 100644 --- a/collate/icu/_ucol.pyx +++ b/collate/icu/_icu.pyx @@ -17,7 +17,7 @@ import sys -cdef extern from "unicode/utypes.h": +cdef extern from "unicode/utypes.h": cdef enum UErrorCode: U_USING_DEFAULT_WARNING = -127 @@ -28,19 +28,19 @@ cdef extern from "unicode/utypes.h": ctypedef char uint8_t int U_FAILURE(UErrorCode status) -cdef extern from "unicode/utf.h": +cdef extern from "unicode/utf.h": ctypedef int UChar ctypedef int UChar32 -cdef extern from "unicode/ustring.h": +cdef extern from "unicode/ustring.h": UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity, int32_t *pDestLength, UChar32 *src, int32_t srcLength, UErrorCode *status) -cdef extern from "unicode/ucol.h": +cdef extern from "unicode/ucol.h": ctypedef struct UCollator: pass @@ -55,7 +55,40 @@ cdef extern from "unicode/ucol.h": UChar *source, int32_t sourceLength, UChar *target, int32_t targetLength) -cdef extern from "Python.h": +cdef extern from "unicode/ubrk.h": + cdef enum UBreakIteratorType: + UBRK_CHARACTER = 0 + UBRK_WORD = 1 + UBRK_LINE = 2 + UBRK_SENTENCE = 3 + # UBRK_TITLE = 4 # Deprecated + UBRK_COUNT = 5 + + DEF UBRK_DONE = ((int)(-1)) + + ctypedef struct UBreakIterator: + pass + + UBreakIterator *ubrk_open(UBreakIteratorType type, + char *locale, + UChar *text, + int32_t textLength, + UErrorCode *status) + void ubrk_close(UBreakIterator *bi) + void ubrk_setText(UBreakIterator *bi, + UChar *text, + int32_t textLength, + UErrorCode *status) + int32_t ubrk_current(UBreakIterator *bi) + int32_t ubrk_next(UBreakIterator *bi) + int32_t 
ubrk_previous(UBreakIterator *bi) + int32_t ubrk_first(UBreakIterator *bi) + int32_t ubrk_last(UBreakIterator *bi) + int32_t ubrk_preceding(UBreakIterator *bi, int32_t offset) + int32_t ubrk_following(UBreakIterator *bi, int32_t offset) + + +cdef extern from "Python.h": int PyUnicode_Check(ob) int PyString_Check(ob) @@ -197,3 +230,47 @@ cdef class Collator: (u2).data, (u2).length, ) + +cdef class WordBreaker: + cdef UBreakIterator *breaker + cdef readonly object locale + + def __cinit__(self, locale): + cdef UBreakIterator *breaker + cdef UErrorCode status + cdef char *clocale + status = U_ZERO_ERROR + clocale = PyString_AS_STRING(locale) + breaker = ubrk_open(UBRK_WORD, clocale, NULL, 0, &status) + if U_FAILURE(status): + raise ValueError("Couldn't create a breaker") + if ((status == U_USING_DEFAULT_WARNING + or status == U_USING_FALLBACK_WARNING) + and locale != "root" and locale != "C"): + raise ValueError("Invalid locale %s" % locale) + self.breaker = breaker + self.locale = locale + + def words(self, string): + cdef UErrorCode status + cdef UCharString uni + status = U_ZERO_ERROR + uni = UCharString(string) + ubrk_setText(self.breaker, + (uni).data, + (uni).length, &status) + if U_FAILURE(status): + raise ValueError("Couldn't set text to %s: %d" % (string, status)) + p = ubrk_first(self.breaker) + words = [] + while p != UBRK_DONE: + n = ubrk_next(self.breaker) + if p != n and n != UBRK_DONE: + words.append(string[p:n]) + p = n + return words + + def __dealloc__(self): + if self.breaker != NULL: + ubrk_close(self.breaker) + diff --git a/collate/syslocale.py b/collate/syslocale.py index 1ee2924..e2aeed9 100644 --- a/collate/syslocale.py +++ b/collate/syslocale.py @@ -12,11 +12,10 @@ locale of all previous collators and anything else using the system locale information. Use this collation backend if... - - You are on a system without ICU or UCA datafiles for the locale, - and DUCET results are not acceptable. + - You are on a system without ICU. 
Avoid this backend if... - - ICU or UCA support is available for the current locale. + - ICU is available for the current locale. - You are sorting strings from alphabets outside the primary locale. - You need to support collating multiple locales at once. - You need the same results across multiple platforms. @@ -24,6 +23,7 @@ Avoid this backend if... """ import locale +import re import collate.errors import collate._abcollator @@ -66,3 +66,10 @@ class Collator(collate._abcollator.Collator): b = b.decode(self.encoding, "replace") return locale.strcoll(a, b) + def words(self, string, sep=re.compile(r"\W+", re.UNICODE)): + """Split the string into separate words. + + This split is done using the locale's notion of a word boundary. + """ + return re.split(sep, string) + diff --git a/setup.py b/setup.py index cf14037..111cfac 100755 --- a/setup.py +++ b/setup.py @@ -17,9 +17,9 @@ setup(name='collate', description="Python text collation", license="MIT / ZPL 2.1", ext_modules=[ - Extension('collate.icu._ucol', - ['collate/icu/_ucol.pyx'], + Extension('collate.icu._icu', + ['collate/icu/_icu.pyx'], libraries=libraries)], cmdclass=dict(build_ext=build_ext), - packages=["collate", "collate.icu", "collate.uca", "collate.syslocale"], + packages=["collate", "collate.icu"], ) -- 2.20.1