More tweaks; notably try to insert paragraph breaks rather than a separate Python...
authorJoe Wreschnig <joe.wreschnig@gmail.com>
Fri, 19 Feb 2010 09:23:57 +0000 (01:23 -0800)
committerJoe Wreschnig <joe.wreschnig@gmail.com>
Fri, 19 Feb 2010 09:23:57 +0000 (01:23 -0800)
collate/_abcollator.py
collate/_strings.py [deleted file]
collate/icu/__init__.py
collate/strings.py [new file with mode: 0644]

index fdd7783..0ae5d45 100644 (file)
@@ -1,14 +1,28 @@
-import collate._strings
+import collate.strings
 
 class Collator(object):
     def cmp(self, string1, string2):
         """Return negative if a < b, zero if a == b, positive if a > b."""
         return cmp(self.key(string1), self.key(string2))
 
+    def words(self, string):
+        """Split the string along word boundaries."""
+        if isinstance(string, str):
+            string = string.decode(self.encoding, 'replace')
+        return string.split()
+
     def sortemekey(self, string, invalid=float('inf')):
+        """Return a key based on sortemes of a string.
+
+        If the string is a str instance, it is decoded to a unicode
+        instance according to the 'encoding' attribute of the
+        Collator.
+        """
         keys = []
-        for sorteme in collate._strings.sortemes(string):
-            num, alpha = collate._strings.numeric(sorteme, invalid)
+        if isinstance(string, str):
+            string = string.decode(self.encoding, 'replace')
+        for sorteme in collate.strings.sortemes(string):
+            num, alpha = collate.strings.numeric(sorteme, invalid)
             if num == invalid:
                 keys.append(self.key(alpha))
             else:
diff --git a/collate/_strings.py b/collate/_strings.py
deleted file mode 100644 (file)
index aed2ba7..0000000
+++ /dev/null
@@ -1,227 +0,0 @@
-import unicodedata
-
-CONTINUE_ON = frozenset([
-    "Ll", "Lm", "Lo", "Lt", "Lu",
-    "Mc", "Me", "Mn",
-    "Nd", "Nl", "No",
-    "Po",
-    "Zs",
-    ])
-
-UNKNOWN, LETTER, NUMBER = range(3)
-
-def sortemes(string):
-    """Generate a list of sortemes for the string.
-
-    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
-    sort information. This is larger than a word boundry but smaller
-    than a sentence boundry; roughly, a sorteme boundry occurs between
-    letters and numbers, between numbers and numbrs if 'too much'
-    punctuation exists in between, between lines.
-
-    There is no formal specification for sortemes; the goal of this
-    function is to provide good output for Collator.sortemekey.
-    """
-
-    words = []
-    if not string:
-        return words
-    string = unicode(string)
-    start = None
-    last = None
-    mode = UNKNOWN
-    previous_mode = UNKNOWN
-    category = "XX"
-    for i, c in enumerate(string):
-        broke = False
-        prev_category = category
-        this_mode = mode
-        category = unicodedata.category(c)
-
-        # Split at the first letter following a number or
-        # non-continuing character.
-        if category[0] == "L":
-            if mode != LETTER:
-                broke = True
-                mode = LETTER
-
-        # Split at the first number following a non-number or
-        # non-continuing character.
-        elif category[0] == "N":
-            if mode != NUMBER:
-                broke = True
-                mode = NUMBER
-
-        # Split if we find a non-continuing character ("weird" ones).
-        elif category not in CONTINUE_ON:
-            broke = True
-            mode = UNKNOWN
-
-        # Only certain punctuation allowed in numbers.
-        elif mode == NUMBER and category[0] == "P" and c not in "',._":
-            broke = True
-            mode = UNKNOWN
-
-        # Split if we find two pieces of punctuation in a row, even
-        # if we should otherwise continue.
-        elif i > 0 and prev_category[0] == "P" and category[0] == "P":
-            broke = True
-            mode = UNKNOWN
-
-        if broke and start is not None and last is not None:
-            # If we read two strings separated by weird punctuation,
-            # pretend the punctuation isn't there.
-            if (this_mode == previous_mode == LETTER
-                and (category[0] == "P" or prev_category[0] == "P")
-                and words):
-                words[-1] += u" " + string[start:last+1]
-            else:
-                # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
-                # Which sorts after ["foo", "bar"].
-                if this_mode == NUMBER and previous_mode == LETTER and words:
-                    words[-1] += u" "
-                words.append(string[start:last+1])
-                previous_mode = this_mode
-
-        if broke:
-            start = i
-            last = None
-        if category[0] in "LN":
-            last = i
-    this_mode = mode
-    if start is not None and last is not None:
-        if this_mode == LETTER and previous_mode == LETTER and words:
-            words[-1] += u" " + string[start:last+1]
-        else:
-            if this_mode == NUMBER and previous_mode == LETTER and words:
-                words[-1] += u" "
-            words.append(string[start:last+1])
-    return words
-
-def numeric(orig, invalid=float('inf')):
-    if not orig:
-        return (invalid, '')
-
-    string = unicode(orig)
-    for c in string:
-        if c.isnumeric():
-            break
-    else:
-        return (invalid, orig)
-
-    mult = 1
-    while string[:1] == u"-" or string[:1] == u"+":
-        if string[:1] == u"-":
-            mult = -mult
-        string = string[1:]
-
-    if not string[:1].isnumeric():
-        return (invalid, orig)
-
-    string = normalize_punc(string)
-
-    # Early out if possible.
-    try:
-        return (float(string) * mult, orig)
-    except ValueError:
-        pass
-
-    # Otherwise we need to do this the hard way.
-    def _numeric(string):
-        total = 0
-        for c in string:
-            v = unicodedata.numeric(c)
-            if v >= 1 or v == 0:
-                total *= 10
-            total += v
-        return total
-
-    try:
-        whole, frac = string.split(".")
-        whole = _numeric(whole)
-        frac = _numeric(frac) / (10.0 ** len(frac))
-        return (mult * (whole + frac), orig)
-    except ValueError:
-        return (mult * _numeric(string), orig)
-
-def normalize_punc(string):
-    string = unicode(string.strip(u",.'"))
-    string = filter(lambda u: u.isnumeric() or u in u",.'", string)
-    commas = string.count(u",")
-    stops = string.count(u".")
-    quotes = string.count(u"'")
-
-    # If anything occurs more than once, it's a separator.
-    if commas > 1:
-        string = string.replace(u",", u"")
-        commas = 0
-    if stops > 1:
-        string = string.replace(u".", u"")
-        stops = 0
-    if quotes > 1:
-        string = string.replace(u"'", u"")
-        quotes = 0
-
-    def normalize_two(a, b, string):
-        # One of each - assume the first is grouping, second is point.
-        a_idx = string.rindex(a)
-        b_idx = string.rindex(b)
-        if a_idx > b_idx:
-            string = string.replace(b, u"").replace(a, u".")
-        else:
-            string = string.replace(a, u"").replace(b, u".")
-        return string
-
-    if commas and stops and quotes:
-        # If all three, assume the middle is the decimal point.
-        # A,AAA.BB'CC
-        # A.AAA,BB'CC
-        # A,AAA'BB.CC
-        # A.AAA'BB,CC
-        # Not really valid, so do whatever we want...
-        # A'AAA.BB,CC
-        # A'AAA,BB.CC
-        comma_idx = string.index(u",")
-        stops_idx = string.index(u".")
-        quotes_idx = string.index(u"'")
-        if (comma_idx < stops_idx < quotes_idx
-            or quotes_idx < stops_idx < comma_idx):
-            string = string.replace(u",", u"").replace(u"'", u"")
-        elif (comma_idx < quotes_idx < stops_idx
-            or stops_idx < quotes_idx < comma_idx):
-            string = string.replace(
-                u",", u"").replace(
-                u".", u"").replace(
-                u"'", u".")
-        else:
-            string = string.replace(
-                u"'", u"").replace(
-                u".", u"").replace(
-                u",", u".")
-
-    elif stops and quotes:
-        string = normalize_two(u".", u"'", string)
-
-    elif commas and quotes:
-        string = normalize_two(u",", u"'", string)
-
-    elif commas and stops:
-        string = normalize_two(u",", u".", string)
-
-    elif commas:
-        if string[-4:-3] == u"," and len(string) <= 7:
-            # Single comma as a thousands separator.
-            string = string.replace(u",", u"")
-        else:
-            # Single comma, not thousands - probably a decimal point.
-            string = string.replace(u",", u".")
-
-    elif quotes:
-        # Single quote, probably MM'SS", equivalent to a decimal point.
-        string = string.replace(u"'", u".")
-
-    elif stops and string[-4:] == ".000":
-        # Single stop, but no decimal - probably grouping.
-        string = string.replace(u".", u"")
-
-    return string
index 892b8a1..5f3ec05 100644 (file)
@@ -36,6 +36,13 @@ class Collator(collate._abcollator.Collator):
             # so this is a harmless error.
             self._breaker = _icu.WordBreaker("root")
 
+    def words(self, string):
+        """Split the string along word boundaries."""
+        if isinstance(string, str):
+            string = string.decode(self.encoding)
+        words = self._breaker.words(string)
+        return [w for w in words if not w.isspace()]
+
     def key(self, string):
         """Sort key for a string.
 
diff --git a/collate/strings.py b/collate/strings.py
new file mode 100644 (file)
index 0000000..bc4ed62
--- /dev/null
@@ -0,0 +1,231 @@
+import unicodedata
+
+CONTINUE_ON = frozenset([
+    "Ll", "Lm", "Lo", "Lt", "Lu",
+    "Mc", "Me", "Mn",
+    "Nd", "Nl", "No",
+    "Po",
+    "Zs",
+    ])
+
+UNKNOWN, LETTER, NUMBER = range(3)
+
+BREAKER = u"\u2029"
+
+def sortemes(string):
+    """Generate a list of sortemes for the string.
+
+    A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
+    sort information. This is larger than a word boundary but smaller
+    than a sentence boundary; roughly, a sorteme boundary occurs between
+    letters and numbers, between numbers and numbers if 'too much'
+    punctuation exists in between, and between lines.
+
+    There is no formal specification for sortemes; the goal of this
+    function is to provide good output for Collator.sortemekey.
+    """
+
+    words = []
+    if not string:
+        return words
+    string = unicode(string)
+    start = None
+    last = None
+    mode = UNKNOWN
+    previous_mode = UNKNOWN
+    category = "XX"
+
+    # TODO(jfw): This kind of evolved over time, there's probably a much
+    # faster / more concise way to express it now.
+    for i, c in enumerate(string):
+        broke = False
+        prev_category = category
+        this_mode = mode
+        category = unicodedata.category(c)
+
+        # Split at the first letter following a number or
+        # non-continuing character.
+        if category[0] == "L":
+            if mode != LETTER:
+                broke = True
+                mode = LETTER
+
+        # Split at the first number following a non-number or
+        # non-continuing character.
+        elif category[0] == "N":
+            if mode != NUMBER:
+                broke = True
+                mode = NUMBER
+
+        # Split if we find a non-continuing character ("weird" ones).
+        elif category not in CONTINUE_ON:
+            broke = True
+            mode = UNKNOWN
+
+        # Only certain punctuation allowed in numbers.
+        elif mode == NUMBER and category[0] == "P" and c not in "',._":
+            broke = True
+            mode = UNKNOWN
+
+        # Split if we find two pieces of punctuation in a row, even
+        # if we should otherwise continue.
+        elif i > 0 and prev_category[0] == "P" and category[0] == "P":
+            broke = True
+            mode = UNKNOWN
+
+        if broke and start is not None and last is not None:
+            # If we read two letter runs in a row, merge them into one
+            # sorteme, joined by BREAKER instead of what separated them.
+            if (this_mode == previous_mode == LETTER
+                and words):
+                words[-1] += BREAKER + string[start:last+1]
+            else:
+                # This ensures "foo2 bar" sorts as ["foo" + BREAKER, 2, "bar"],
+                # which sorts after ["foo", "bar"].
+                if this_mode == NUMBER and previous_mode == LETTER and words:
+                    words[-1] += BREAKER
+                words.append(string[start:last+1])
+                previous_mode = this_mode
+
+        if broke:
+            start = i
+            last = None
+        if category[0] in "LN":
+            last = i
+    this_mode = mode
+    if start is not None and last is not None:
+        if this_mode == LETTER and previous_mode == LETTER and words:
+            words[-1] += BREAKER + string[start:last+1]
+        else:
+            if this_mode == NUMBER and previous_mode == LETTER and words:
+                words[-1] += BREAKER
+            words.append(string[start:last+1])
+    return words
+
+def numeric(orig, invalid=float('inf')):
+    if not orig:
+        return (invalid, '')
+
+    string = unicode(orig)
+    for c in string:
+        if c.isnumeric():
+            break
+    else:
+        return (invalid, orig)
+
+    mult = 1
+    while string[:1] == u"-" or string[:1] == u"+":
+        if string[:1] == u"-":
+            mult = -mult
+        string = string[1:]
+
+    if not string[:1].isnumeric():
+        return (invalid, orig)
+
+    string = normalize_punc(string)
+
+    # Early out if possible.
+    try:
+        return (float(string) * mult, orig)
+    except ValueError:
+        pass
+
+    # Otherwise we need to do this the hard way.
+    def _numeric(string):
+        total = 0
+        for c in string:
+            v = unicodedata.numeric(c)
+            if v >= 1 or v == 0:
+                total *= 10
+            total += v
+        return total
+
+    try:
+        whole, frac = string.split(".")
+        whole = _numeric(whole)
+        frac = _numeric(frac) / (10.0 ** len(frac))
+        return (mult * (whole + frac), orig)
+    except ValueError:
+        return (mult * _numeric(string), orig)
+
+def normalize_punc(string):
+    string = unicode(string.strip(u",.'"))
+    string = filter(lambda u: u.isnumeric() or u in u",.'", string)
+    commas = string.count(u",")
+    stops = string.count(u".")
+    quotes = string.count(u"'")
+
+    # If anything occurs more than once, it's a separator.
+    if commas > 1:
+        string = string.replace(u",", u"")
+        commas = 0
+    if stops > 1:
+        string = string.replace(u".", u"")
+        stops = 0
+    if quotes > 1:
+        string = string.replace(u"'", u"")
+        quotes = 0
+
+    def normalize_two(a, b, string):
+        # One of each - assume the first is grouping, second is point.
+        a_idx = string.rindex(a)
+        b_idx = string.rindex(b)
+        if a_idx > b_idx:
+            string = string.replace(b, u"").replace(a, u".")
+        else:
+            string = string.replace(a, u"").replace(b, u".")
+        return string
+
+    if commas and stops and quotes:
+        # If all three, assume the middle is the decimal point.
+        # A,AAA.BB'CC
+        # A.AAA,BB'CC
+        # A,AAA'BB.CC
+        # A.AAA'BB,CC
+        # Not really valid, so do whatever we want...
+        # A'AAA.BB,CC
+        # A'AAA,BB.CC
+        comma_idx = string.index(u",")
+        stops_idx = string.index(u".")
+        quotes_idx = string.index(u"'")
+        if (comma_idx < stops_idx < quotes_idx
+            or quotes_idx < stops_idx < comma_idx):
+            string = string.replace(u",", u"").replace(u"'", u"")
+        elif (comma_idx < quotes_idx < stops_idx
+            or stops_idx < quotes_idx < comma_idx):
+            string = string.replace(
+                u",", u"").replace(
+                u".", u"").replace(
+                u"'", u".")
+        else:
+            string = string.replace(
+                u"'", u"").replace(
+                u".", u"").replace(
+                u",", u".")
+
+    elif stops and quotes:
+        string = normalize_two(u".", u"'", string)
+
+    elif commas and quotes:
+        string = normalize_two(u",", u"'", string)
+
+    elif commas and stops:
+        string = normalize_two(u",", u".", string)
+
+    elif commas:
+        if string[-4:-3] == u"," and len(string) <= 7:
+            # Single comma as a thousands separator.
+            string = string.replace(u",", u"")
+        else:
+            # Single comma, not thousands - probably a decimal point.
+            string = string.replace(u",", u".")
+
+    elif quotes:
+        # Single quote, probably MM'SS", equivalent to a decimal point.
+        string = string.replace(u"'", u".")
+
+    elif stops and string[-4:] == ".000":
+        # Single stop, but no decimal - probably grouping.
+        string = string.replace(u".", u"")
+
+    return string