From f73c4c6cd3ed326c5735ab33a6896697227d07e3 Mon Sep 17 00:00:00 2001
From: Joe Wreschnig <joe.wreschnig@gmail.com>
Date: Mon, 15 Feb 2010 01:20:21 -0800
Subject: [PATCH 1/1] Word-splitting.

---
 collate/_abcollator.py              |  7 +++
 collate/icu/__init__.py             | 18 +++++-
 collate/icu/{_ucol.pyx => _icu.pyx} | 87 +++++++++++++++++++++++++++--
 collate/syslocale.py                | 13 ++++-
 setup.py                            |  6 +-
 5 files changed, 118 insertions(+), 13 deletions(-)
 rename collate/icu/{_ucol.pyx => _icu.pyx} (69%)

diff --git a/collate/_abcollator.py b/collate/_abcollator.py
index 71f5f54..99866c3 100644
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -2,3 +2,10 @@ class Collator(object):
     def cmp(self, string1, string2):
         """Return negative if a < b, zero if a == b, positive if a > b."""
         return cmp(self.key(string1), self.key(string2))
+
+    def words(self, string):
+        """Split the string into separate words.
+
+        Splits on runs of whitespace (Unicode whitespace for unicode input).
+        """
+        return string.split()
diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py
index eeee418..00a1538 100644
--- a/collate/icu/__init__.py
+++ b/collate/icu/__init__.py
@@ -11,22 +11,36 @@ Avoid this backend if...
 
 """
 
-import collate.icu._ucol
 import collate._abcollator
 import collate._locale
 import collate.errors
 
+from collate.icu import _icu
+
 class Collator(collate._abcollator.Collator):
     """ICU-based collation."""
 
     def __init__(self, locale, encoding=None):
         locale, encoding = collate._locale.getpair(locale, encoding)
-        self._collator = collate.icu._ucol.Collator(locale)
+        icu_locale = "root" if locale == "C" else locale
+        self._collator = _icu.Collator(icu_locale)
         self.locale = self._collator.locale
         self.encoding = collate._locale.encoding(encoding)
         if self._collator.used_default_information and locale != "C":
             raise collate.errors.InvalidLocaleError(locale)
 
+        try:
+            self._breaker = _icu.WordBreaker(icu_locale)
+        except ValueError:
+            # Only Thai has locale-specific break rules in ICU, so failing
+            # here is harmless: fall back to the root rules.
+            self._breaker = _icu.WordBreaker("root")
+
+    def words(self, string):
+        if isinstance(string, str):
+            string = string.decode(self.encoding, 'replace')
+        return [w for w in self._breaker.words(string) if not w.isspace()]
+
     def key(self, string):
         """Sort key for a string.
 
diff --git a/collate/icu/_ucol.pyx b/collate/icu/_icu.pyx
similarity index 69%
rename from collate/icu/_ucol.pyx
rename to collate/icu/_icu.pyx
index e54dd53..c3b1484 100644
--- a/collate/icu/_ucol.pyx
+++ b/collate/icu/_icu.pyx
@@ -17,7 +17,7 @@
 
 import sys
 
-cdef extern from  "unicode/utypes.h":
+cdef extern from "unicode/utypes.h":
 
     cdef enum UErrorCode:
         U_USING_DEFAULT_WARNING = -127
@@ -28,19 +28,19 @@ cdef extern from  "unicode/utypes.h":
     ctypedef char uint8_t
     int U_FAILURE(UErrorCode status)
 
-cdef extern from  "unicode/utf.h":
+cdef extern from "unicode/utf.h":
 
     ctypedef int UChar
     ctypedef int UChar32
 
-cdef extern from  "unicode/ustring.h":
+cdef extern from "unicode/ustring.h":
     
     UChar *u_strFromUTF32(UChar *dest, int32_t destCapacity,
                           int32_t *pDestLength,
                           UChar32 *src, int32_t srcLength,
                           UErrorCode *status)
 
-cdef extern from  "unicode/ucol.h":
+cdef extern from "unicode/ucol.h":
 
     ctypedef struct UCollator:
         pass
@@ -55,7 +55,40 @@ cdef extern from  "unicode/ucol.h":
                      UChar *source, int32_t sourceLength,
                      UChar *target, int32_t targetLength)
 
-cdef extern from  "Python.h":
+cdef extern from "unicode/ubrk.h":
+    cdef enum UBreakIteratorType:
+        UBRK_CHARACTER = 0
+        UBRK_WORD = 1
+        UBRK_LINE = 2
+        UBRK_SENTENCE = 3
+        # UBRK_TITLE = 4 is marked deprecated, so it is not declared here.
+        UBRK_COUNT = 5
+
+    DEF UBRK_DONE = ((int)(-1))
+
+    ctypedef struct UBreakIterator:
+        pass
+
+    UBreakIterator *ubrk_open(UBreakIteratorType type,
+                              char *locale,
+                              UChar *text,
+                              int32_t textLength,
+                              UErrorCode *status)
+    void ubrk_close(UBreakIterator *bi)
+    void ubrk_setText(UBreakIterator *bi,
+                      UChar *text,
+                      int32_t textLength,
+                      UErrorCode *status)
+    int32_t ubrk_current(UBreakIterator *bi)
+    int32_t ubrk_next(UBreakIterator *bi)
+    int32_t ubrk_previous(UBreakIterator *bi)
+    int32_t ubrk_first(UBreakIterator *bi)
+    int32_t ubrk_last(UBreakIterator *bi)
+    int32_t ubrk_preceding(UBreakIterator *bi, int32_t offset)
+    int32_t ubrk_following(UBreakIterator *bi, int32_t offset)
+
+
+cdef extern from "Python.h":
 
     int PyUnicode_Check(ob)
     int PyString_Check(ob)
@@ -197,3 +230,47 @@ cdef class Collator:
             (<UCharString>u2).data,
             (<UCharString>u2).length,
             )
+
+cdef class WordBreaker:
+    """Splits text at ICU word boundaries for one locale."""
+    cdef UBreakIterator *breaker
+    cdef readonly object locale
+
+    def __cinit__(self, locale):
+        cdef UErrorCode status
+        cdef char *clocale
+        status = U_ZERO_ERROR
+        clocale = PyString_AS_STRING(locale)
+        # Store on self right away so __dealloc__ closes it if we raise.
+        self.breaker = ubrk_open(UBRK_WORD, clocale, NULL, 0, &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't create a breaker")
+        if ((status == U_USING_DEFAULT_WARNING
+             or status == U_USING_FALLBACK_WARNING)
+            and locale != "root" and locale != "C"):
+            raise ValueError("Invalid locale %s" % locale)
+        self.locale = locale
+
+    def words(self, string):
+        """Return, in order, each segment between two word boundaries."""
+        cdef UErrorCode status
+        cdef UCharString uni
+        status = U_ZERO_ERROR
+        uni = UCharString(string)
+        ubrk_setText(self.breaker, (<UCharString>uni).data,
+                     (<UCharString>uni).length, &status)
+        if U_FAILURE(status):
+            raise ValueError("Couldn't set text to %s: %d" % (string, status))
+        p = ubrk_first(self.breaker)
+        words = []
+        while p != UBRK_DONE:
+            n = ubrk_next(self.breaker)
+            if p != n and n != UBRK_DONE:
+                words.append(string[p:n])
+            p = n
+        return words
+
+    def __dealloc__(self):
+        if self.breaker != NULL:
+            ubrk_close(self.breaker)
+
diff --git a/collate/syslocale.py b/collate/syslocale.py
index 1ee2924..e2aeed9 100644
--- a/collate/syslocale.py
+++ b/collate/syslocale.py
@@ -12,11 +12,10 @@ locale of all previous collators and anything else using the system
 locale information.
 
 Use this collation backend if...
- - You are on a system without ICU or UCA datafiles for the locale,
-   and DUCET results are not acceptable.
+ - You are on a system without ICU.
 
 Avoid this backend if...
- - ICU or UCA support is available for the current locale.
+ - ICU is available for the current locale.
  - You are sorting strings from alphabets outside the primary locale.
  - You need to support collating multiple locales at once.
  - You need the same results across multiple platforms.
@@ -24,6 +23,7 @@ Avoid this backend if...
 """
 
 import locale
+import re
 
 import collate.errors
 import collate._abcollator
@@ -66,3 +66,10 @@ class Collator(collate._abcollator.Collator):
             b = b.decode(self.encoding, "replace")
         return locale.strcoll(a, b)
                                   
+    def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
+        """Split the string into separate words.
+
+        Splits on runs of non-word characters; empty pieces are dropped.
+        """
+        return [word for word in re.split(sep, string) if word]
+
diff --git a/setup.py b/setup.py
index cf14037..111cfac 100755
--- a/setup.py
+++ b/setup.py
@@ -17,9 +17,9 @@ setup(name='collate',
       description="Python text collation",
       license="MIT / ZPL 2.1",
       ext_modules=[
-          Extension('collate.icu._ucol',
-                    ['collate/icu/_ucol.pyx'],
+          Extension('collate.icu._icu',
+                    ['collate/icu/_icu.pyx'],
                     libraries=libraries)],
       cmdclass=dict(build_ext=build_ext),
-      packages=["collate", "collate.icu", "collate.uca", "collate.syslocale"],
+      packages=["collate", "collate.icu"],
       )
-- 
2.20.1