Word-splitting.
author Joe Wreschnig <joe.wreschnig@gmail.com>
Mon, 15 Feb 2010 09:20:21 +0000 (01:20 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Mon, 15 Feb 2010 09:20:21 +0000 (01:20 -0800)
collate/_abcollator.py
collate/icu/__init__.py
collate/icu/_icu.pyx [new file with mode: 0644]
collate/icu/_ucol.pyx [deleted file]
collate/syslocale.py
setup.py

index 71f5f54..99866c3 100644 (file)
@@ -2,3 +2,10 @@ class Collator(object):
     def cmp(self, string1, string2):
         """Return negative if a < b, zero if a == b, positive if a > b."""
         return cmp(self.key(string1), self.key(string2))
+
+    def words(self, string):
+        """Split the string into separate words.
+
+        This split is done using Unicode's definition of whitespace.
+        """
+        return string.split()
index eeee418..00a1538 100644 (file)
@@ -11,22 +11,36 @@ Avoid this backend if...
 
 """
 
-import collate.icu._ucol
 import collate._abcollator
 import collate._locale
 import collate.errors
 
+from collate.icu import _icu
+
 class Collator(collate._abcollator.Collator):
     """ICU-based collation."""
 
     def __init__(self, locale, encoding=None):
         locale, encoding = collate._locale.getpair(locale, encoding)
-        self._collator = collate.icu._ucol.Collator(locale)
+        icu_locale = "root" if locale == "C" else locale
+        self._collator = _icu.Collator(icu_locale)
         self.locale = self._collator.locale
         self.encoding = collate._locale.encoding(encoding)
         if self._collator.used_default_information and locale != "C":
             raise collate.errors.InvalidLocaleError(locale)
 
+        try:
+            self._breaker = _icu.WordBreaker(icu_locale)
+        except ValueError:
+            # Thai is the only language with a special break locale,
+            # so this is a harmless error.
+            self._breaker = _icu.WordBreaker("root")
+
+    def words(self, string):
+        if isinstance(string, str):
+            string = string.decode(self.encoding, 'replace')
+        return filter(lambda u: not u.isspace(), self._breaker.words(string))
+
     def key(self, string):
         """Sort key for a string.
 
diff --git a/collate/icu/_icu.pyx b/collate/icu/_icu.pyx
new file mode 100644 (file)
index 0000000..c3b1484
--- /dev/null
@@ -0,0 +1,276 @@
+##############################################################################
+#
+# Copyright (c) 2004 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Simple wrapper for the ICU ucol (collation) and ubrk (break iteration) APIs
+
+"""
+
+import sys
+
+cdef extern from "unicode/utypes.h":
+
+    cdef enum UErrorCode:
+        U_USING_DEFAULT_WARNING = -127
+        U_USING_FALLBACK_WARNING = -128
+        U_ZERO_ERROR = 0
+        U_ILLEGAL_ARGUMENT_ERROR = 1
+    ctypedef int int32_t
+    ctypedef char uint8_t
+    int U_FAILURE(UErrorCode status)
+
+cdef extern from "unicode/utf.h":
+
+    ctypedef int UChar
+    ctypedef int UChar32
+
+cdef extern from "unicode/ustring.h":
+    
+    UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity,
+                          int32_t *pDestLength,
+                          UChar32 *src, int32_t srcLength,
+                          UErrorCode *status)
+
+cdef extern from "unicode/ucol.h":
+
+    ctypedef struct UCollator:
+        pass
+    UCollator *ucol_open(char *locale, UErrorCode *status)
+    void ucol_close(UCollator *collator)
+    int32_t ucol_getSortKey(UCollator *coll,
+                            UChar *source, int32_t sourceLength,
+                            uint8_t *result,
+                            int32_t resultLength
+                            )
+    int ucol_strcoll(UCollator *coll,
+                     UChar *source, int32_t sourceLength,
+                     UChar *target, int32_t targetLength)
+
+cdef extern from "unicode/ubrk.h":
+    cdef enum UBreakIteratorType:
+        UBRK_CHARACTER = 0
+        UBRK_WORD = 1
+        UBRK_LINE = 2
+        UBRK_SENTENCE = 3
+        # UBRK_TITLE = 4 # Deprecated
+        UBRK_COUNT = 5
+
+    DEF UBRK_DONE = ((int)(-1))
+
+    ctypedef struct UBreakIterator:
+        pass
+
+    UBreakIterator *ubrk_open(UBreakIteratorType type,
+                              char *locale,
+                              UChar *text,
+                              int32_t textLength,
+                              UErrorCode *status)
+    void ubrk_close(UBreakIterator *bi)
+    void ubrk_setText(UBreakIterator *bi,
+                      UChar *text,
+                      int32_t textLength,
+                      UErrorCode *status)
+    int32_t ubrk_current(UBreakIterator *bi)
+    int32_t ubrk_next(UBreakIterator *bi)
+    int32_t ubrk_previous(UBreakIterator *bi)
+    int32_t ubrk_first(UBreakIterator *bi)
+    int32_t ubrk_last(UBreakIterator *bi)
+    int32_t ubrk_preceding(UBreakIterator *bi, int32_t offset)
+    int32_t ubrk_following(UBreakIterator *bi, int32_t offset)
+
+
+cdef extern from "Python.h":
+
+    int PyUnicode_Check(ob)
+    int PyString_Check(ob)
+
+    ctypedef int Py_UNICODE
+    Py_UNICODE *PyUnicode_AS_UNICODE(ob)
+    int PyUnicode_GET_SIZE(ob)
+    char *PyString_AS_STRING(ob)
+
+    void *PyMem_Malloc(int size)
+    void PyMem_Free(void *p)
+    object PyString_FromStringAndSize(char *v, int l)
+    
+    
+cdef class UCharString:
+    """Wrapper for ICU UChar arrays
+    """
+
+    cdef UChar *data
+    cdef readonly int32_t length
+    cdef readonly object base
+    cdef readonly int need_to_free
+
+    def __cinit__(self, text):
+        cdef int32_t buffsize
+        cdef UErrorCode status
+        cdef Py_UNICODE *str
+        cdef int length
+
+        if not PyUnicode_Check(text):
+            if PyString_Check(text):
+                text = unicode(text)
+                assert PyUnicode_Check(text)
+            else:
+                raise TypeError("Expected unicode string")
+
+        length = PyUnicode_GET_SIZE(text)
+        str = PyUnicode_AS_UNICODE(text)
+        
+
+        if sizeof(Py_UNICODE) == 2:
+            self.data = str
+            self.length = length
+            self.base = text
+            self.need_to_free = 0
+        else:
+            buffsize = 2*length + 1
+            self.data = <UChar*>PyMem_Malloc(buffsize*sizeof(UChar))
+            if self.data == NULL:
+                raise MemoryError
+            status = U_ZERO_ERROR
+            u_strFromUTF32(self.data, buffsize, &(self.length),
+                           <UChar32*>str, length, &status)
+            assert self.length <= buffsize
+            self.need_to_free = 1
+            if U_FAILURE(status):
+                raise ValueError(
+                    "Couldn't convert Python unicode data to ICU unicode data."
+                    )
+
+    def __dealloc__(self):
+        if self.need_to_free and self.data != NULL:
+            PyMem_Free(self.data)
+            self.data = NULL
+
+
+cdef class Collator:
+    """Compute a collation key for a unicode string.
+    """
+
+    cdef UCollator *collator
+    cdef readonly object locale
+    cdef readonly int used_default_information
+
+    def __cinit__(self, locale):
+        cdef UCollator *collator
+        cdef UErrorCode status
+
+        if not PyString_Check(locale):
+            raise TypeError("String locale expected")
+        
+        status = U_ZERO_ERROR
+        collator = ucol_open(PyString_AS_STRING(locale), &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't create a collator")
+        self.collator = collator
+        self.locale = locale
+        if (status == U_USING_DEFAULT_WARNING
+            or status == U_USING_FALLBACK_WARNING):
+            status = U_ILLEGAL_ARGUMENT_ERROR
+        self.used_default_information = status
+
+    def __dealloc__(self):
+        if self.collator != NULL:
+            ucol_close(self.collator)
+
+    def key(self, text):
+        """Compute a collation key for the given unicode text.
+
+        Of course, the key is only valid for the given locale.
+        """
+        cdef char *buffer
+        cdef int32_t bufsize
+        cdef int32_t size
+
+        icutext = UCharString(text)
+        bufsize = (<UCharString>icutext).length*2+10
+
+        # the +1 below is needed to avoid an apparent buffer overflow bug in ICU
+        buffer = <char*>PyMem_Malloc(bufsize +1)
+        if buffer == NULL:
+            raise MemoryError
+        size = ucol_getSortKey(self.collator,
+                               (<UCharString>icutext).data,
+                               (<UCharString>icutext).length,
+                               <uint8_t*>buffer, bufsize)
+        while size > bufsize:
+            bufsize = size
+            PyMem_Free(buffer)
+            buffer = <char*>PyMem_Malloc(bufsize +1) # See above +1
+            if buffer == NULL:
+                raise MemoryError
+            size = ucol_getSortKey(self.collator,
+                                   (<UCharString>icutext).data,
+                                   (<UCharString>icutext).length,
+                                   <uint8_t*>buffer, bufsize)
+
+        result = PyString_FromStringAndSize(buffer, size)
+        PyMem_Free(buffer)
+        return result
+
+    def cmp(self, o1, o2):
+        u1 = UCharString(o1)
+        u2 = UCharString(o2)
+        return ucol_strcoll(
+            self.collator,
+            (<UCharString>u1).data,
+            (<UCharString>u1).length,
+            (<UCharString>u2).data,
+            (<UCharString>u2).length,
+            )
+
+cdef class WordBreaker:
+    cdef UBreakIterator *breaker
+    cdef readonly object locale
+
+    def __cinit__(self, locale):
+        cdef UBreakIterator *breaker
+        cdef UErrorCode status
+        cdef char *clocale
+        status = U_ZERO_ERROR
+        clocale = PyString_AS_STRING(locale)
+        breaker = ubrk_open(UBRK_WORD, clocale, NULL, 0, &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't create a breaker")
+        if ((status == U_USING_DEFAULT_WARNING
+             or status == U_USING_FALLBACK_WARNING)
+            and locale != "root" and locale != "C"):
+            raise ValueError("Invalid locale %s" % locale)
+        self.breaker = breaker
+        self.locale = locale
+
+    def words(self, string):
+        cdef UErrorCode status
+        cdef UCharString uni
+        status = U_ZERO_ERROR
+        uni = UCharString(string)
+        ubrk_setText(self.breaker,
+                     (<UCharString>uni).data,
+                     (<UCharString>uni).length, &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't set text to %s: %d" % (string, status))
+        p = ubrk_first(self.breaker)
+        words = []
+        while p != UBRK_DONE:
+            n = ubrk_next(self.breaker)
+            if p != n and n != UBRK_DONE:
+                words.append(string[p:n])
+            p = n
+        return words
+
+    def __dealloc__(self):
+        if self.breaker != NULL:
+            ubrk_close(self.breaker)
+
diff --git a/collate/icu/_ucol.pyx b/collate/icu/_ucol.pyx
deleted file mode 100644 (file)
index e54dd53..0000000
+++ /dev/null
@@ -1,199 +0,0 @@
-##############################################################################
-#
-# Copyright (c) 2004 Zope Corporation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Simple wrapper for ICU ucol API
-
-"""
-
-import sys
-
-cdef extern from  "unicode/utypes.h":
-
-    cdef enum UErrorCode:
-        U_USING_DEFAULT_WARNING = -127
-        U_USING_FALLBACK_WARNING = -128
-        U_ZERO_ERROR = 0
-        U_ILLEGAL_ARGUMENT_ERROR = 1
-    ctypedef int int32_t
-    ctypedef char uint8_t
-    int U_FAILURE(UErrorCode status)
-
-cdef extern from  "unicode/utf.h":
-
-    ctypedef int UChar
-    ctypedef int UChar32
-
-cdef extern from  "unicode/ustring.h":
-    
-    UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity,
-                          int32_t *pDestLength,
-                          UChar32 *src, int32_t srcLength,
-                          UErrorCode *status)
-
-cdef extern from  "unicode/ucol.h":
-
-    ctypedef struct UCollator:
-        pass
-    UCollator *ucol_open(char *locale, UErrorCode *status)
-    void ucol_close(UCollator *collator)
-    int32_t ucol_getSortKey(UCollator *coll,
-                            UChar *source, int32_t sourceLength,
-                            uint8_t *result,
-                            int32_t resultLength
-                            )
-    int ucol_strcoll(UCollator *coll,
-                     UChar *source, int32_t sourceLength,
-                     UChar *target, int32_t targetLength)
-
-cdef extern from  "Python.h":
-
-    int PyUnicode_Check(ob)
-    int PyString_Check(ob)
-
-    ctypedef int Py_UNICODE
-    Py_UNICODE *PyUnicode_AS_UNICODE(ob)
-    int PyUnicode_GET_SIZE(ob)
-    char *PyString_AS_STRING(ob)
-
-    void *PyMem_Malloc(int size)
-    void PyMem_Free(void *p)
-    object PyString_FromStringAndSize(char *v, int l)
-    
-    
-cdef class UCharString:
-    """Wrapper for ICU UChar arrays
-    """
-
-    cdef UChar *data
-    cdef readonly int32_t length
-    cdef readonly object base
-    cdef readonly int need_to_free
-
-    def __cinit__(self, text):
-        cdef int32_t buffsize
-        cdef UErrorCode status
-        cdef Py_UNICODE *str
-        cdef int length
-
-        if not PyUnicode_Check(text):
-            if PyString_Check(text):
-                text = unicode(text)
-                assert PyUnicode_Check(text)
-            else:
-                raise TypeError("Expected unicode string")
-
-        length = PyUnicode_GET_SIZE(text)
-        str = PyUnicode_AS_UNICODE(text)
-        
-
-        if sizeof(Py_UNICODE) == 2:
-            self.data = str
-            self.length = length
-            self.base = text
-            self.need_to_free = 0
-        else:
-            buffsize = 2*length + 1
-            self.data = <UChar*>PyMem_Malloc(buffsize*sizeof(UChar))
-            if self.data == NULL:
-                raise MemoryError
-            status = U_ZERO_ERROR
-            u_strFromUTF32(self.data, buffsize, &(self.length),
-                           <UChar32*>str, length, &status)
-            assert self.length <= buffsize
-            self.need_to_free = 1
-            if U_FAILURE(status):
-                raise ValueError(
-                    "Couldn't convert Python unicode data to ICU unicode data."
-                    )
-
-    def __dealloc__(self):
-        if self.need_to_free and self.data != NULL:
-            PyMem_Free(self.data)
-            self.data = NULL
-
-
-cdef class Collator:
-    """Compute a collation key for a unicode string.
-    """
-
-    cdef UCollator *collator
-    cdef readonly object locale
-    cdef readonly int used_default_information
-
-    def __cinit__(self, locale):
-        cdef UCollator *collator
-        cdef UErrorCode status
-
-        if not PyString_Check(locale):
-            raise TypeError("String locale expected")
-        
-        status = U_ZERO_ERROR
-        collator = ucol_open(PyString_AS_STRING(locale), &status)
-        if U_FAILURE(status):
-            raise ValueError("Couldn't create a collator")
-        self.collator = collator
-        self.locale = locale
-        if (status == U_USING_DEFAULT_WARNING
-            or status == U_USING_FALLBACK_WARNING):
-            status = U_ILLEGAL_ARGUMENT_ERROR
-        self.used_default_information = status
-
-    def __dealloc__(self):
-        if self.collator != NULL:
-            ucol_close(self.collator)
-
-    def key(self, text):
-        """Compute a collation key for the given unicode text.
-
-        Of course, the key is only valid for the given locale.
-        """
-        cdef char *buffer
-        cdef int32_t bufsize
-        cdef int32_t size
-
-        icutext = UCharString(text)
-        bufsize = (<UCharString>icutext).length*2+10
-
-        # the +1 below is needed to avoid an apprent buffer overflow bug in ICU
-        buffer = <char*>PyMem_Malloc(bufsize +1)
-        if buffer == NULL:
-            raise MemoryError
-        size = ucol_getSortKey(self.collator,
-                               (<UCharString>icutext).data,
-                               (<UCharString>icutext).length,
-                               <uint8_t*>buffer, bufsize)
-        while size > bufsize:
-            bufsize = size
-            PyMem_Free(buffer)
-            buffer = <char*>PyMem_Malloc(bufsize +1) # See above +1
-            if buffer == NULL:
-                raise MemoryError
-            size = ucol_getSortKey(self.collator,
-                                   (<UCharString>icutext).data,
-                                   (<UCharString>icutext).length,
-                                   <uint8_t*>buffer, bufsize)
-
-        result = PyString_FromStringAndSize(buffer, size)
-        PyMem_Free(buffer)
-        return result
-
-    def cmp(self, o1, o2):
-        u1 = UCharString(o1)
-        u2 = UCharString(o2)
-        return ucol_strcoll(
-            self.collator,
-            (<UCharString>u1).data,
-            (<UCharString>u1).length,
-            (<UCharString>u2).data,
-            (<UCharString>u2).length,
-            )
index 1ee2924..e2aeed9 100644 (file)
@@ -12,11 +12,10 @@ locale of all previous collators and anything else using the system
 locale information.
 
 Use this collation backend if...
- - You are on a system without ICU or UCA datafiles for the locale,
-   and DUCET results are not acceptable.
+ - You are on a system without ICU.
 
 Avoid this backend if...
- - ICU or UCA support is available for the current locale.
+ - ICU is available for the current locale.
  - You are sorting strings from alphabets outside the primary locale.
  - You need to support collating multiple locales at once.
  - You need the same results across multiple platforms.
@@ -24,6 +23,7 @@ Avoid this backend if...
 """
 
 import locale
+import re
 
 import collate.errors
 import collate._abcollator
@@ -66,3 +66,10 @@ class Collator(collate._abcollator.Collator):
             b = b.decode(self.encoding, "replace")
         return locale.strcoll(a, b)
                                   
+    def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
+        """Split the string into separate words.
+
+        By default, the split is made on runs of Unicode non-word characters.
+        """
+        return re.split(sep, string)
+
index cf14037..111cfac 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -17,9 +17,9 @@ setup(name='collate',
       description="Python text collation",
       license="MIT / ZPL 2.1",
       ext_modules=[
-          Extension('collate.icu._ucol',
-                    ['collate/icu/_ucol.pyx'],
+          Extension('collate.icu._icu',
+                    ['collate/icu/_icu.pyx'],
                     libraries=libraries)],
       cmdclass=dict(build_ext=build_ext),
-      packages=["collate", "collate.icu", "collate.uca", "collate.syslocale"],
+      packages=["collate", "collate.icu"],
       )