X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2Fstrings.py;h=487257ffe6e8a02dbdde8b9be669a25d1a976ba5;hp=bc4ed629aec5a9b8be46d2da1e9779eca9eaa090;hb=a3bd28edb9b44a1531af3ef8c4ae9cd6a4e2d3b3;hpb=9a7cf6459c40d53b58634f2df56386bf52c12f7c

diff --git a/collate/strings.py b/collate/strings.py
index bc4ed62..487257f 100644
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -1,3 +1,7 @@
+"""String utility functions for collation."""
+
+__all__ = ["sortemes", "numeric", "normalize_number", "deroman"]
+
 import unicodedata
 
 CONTINUE_ON = frozenset([
@@ -10,108 +14,170 @@ CONTINUE_ON = frozenset([
 
 UNKNOWN, LETTER, NUMBER = range(3)
 
-BREAKER = u"\u2029"
-
-def sortemes(string):
+BREAKER = u"\u2028" # Line break character
+HBREAKER = u"\u2029" # Paragraph break character
+INFINITY = float('inf')
+
+KEEP_IN_NUMBERS = u"'.,"
+ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
+
+ROMAN = {
+    u"i": 1,
+    u"v": 5,
+    u"x": 10,
+    u"l": 50,
+    u"c": 100,
+    u"d": 500,
+    u"m": 1000,
+    u"\u2180": 1000,
+    u"\u2181": 5000,
+    u"\u2182": 10000,
+    u"\u2183": 100,
+    u"\u2184": 100,
+    u"\u2185": 6,
+    u"\u2186": 50,
+    u"\u2187": 50000,
+    u"\u2188": 100000,
+    }
+
+def stripends(word):
+    """Strip punctuation and symbols from the ends of a string."""
+    while word and unicodedata.category(word[0])[0] in "PS":
+        word = word[1:]
+    while word and unicodedata.category(word[-1])[0] in "PS":
+        word = word[:-1]
+    return word
+
+def sortemes(string, key=lambda s: s):
     """Generate a list of sortemes for the string.
 
     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
     sort information. This is larger than a word boundry but smaller
     than a sentence boundry; roughly, a sorteme boundry occurs between
-    letters and numbers, between numbers and numbrs if 'too much'
+    letters and numbers, between numbers and numbers if 'too much'
     punctuation exists in between, between lines.
 
     There is no formal specification for sortemes; the goal of this
     function is to provide good output for Collator.sortemekey.
+
     """
 
-    words = []
     if not string:
-        return words
+        return []
+
+    words = []
+    letters = []
+    digits = []
+    lappend = letters.append
+    dappend = digits.append
     string = unicode(string)
-    start = None
-    last = None
-    mode = UNKNOWN
-    previous_mode = UNKNOWN
-    category = "XX"
-
-    # TODO(jfw): This kind of evolved over time, there's probably a much
-    # faster / more concise way to express it now.
-    for i, c in enumerate(string):
-        broke = False
-        prev_category = category
-        this_mode = mode
-        category = unicodedata.category(c)
+    categories = map(unicodedata.category, string)
+    previous = UNKNOWN
+    wappend = words.append
+    join = u"".join
+    i = 0
+
+    for uchar in string:
+        category = categories[i]
+
+        if letters and previous == LETTER and words:
+            word = stripends(words.pop()[1].strip()) + BREAKER
+            letters.insert(0, word)
+            previous = UNKNOWN
 
         # Split at the first letter following a number or
         # non-continuing character.
         if category[0] == "L":
-            if mode != LETTER:
-                broke = True
-                mode = LETTER
+            lappend(uchar)
+            if digits:
+                words.append((numeric(join(digits).strip()), u''))
+                del(digits[:])
+                previous = NUMBER
 
         # Split at the first number following a non-number or
         # non-continuing character.
         elif category[0] == "N":
-            if mode != NUMBER:
-                broke = True
-                mode = NUMBER
+            dappend(uchar)
+            if letters:
+                if unicodedata.category(letters[-1])[0] == "L":
+                    lappend(HBREAKER)
+                wappend((INFINITY, stripends(join(letters))))
+                del(letters[:])
+                previous = LETTER
+
+        # Only certain punctuation allowed in numbers.
+        elif digits and uchar not in ALLOWED_IN_NUMBERS:
+            words.append((numeric(join(digits)), u''))
+            del(digits[:])
+            previous = NUMBER
 
         # Split if we find a non-continuing character ("weird" ones).
         elif category not in CONTINUE_ON:
-            broke = True
-            mode = UNKNOWN
-
-        # Only certain punctuation allowed in numbers.
-        elif mode == NUMBER and category[0] == "P" and c not in "',._":
-            broke = True
-            mode = UNKNOWN
+            if letters:
+                wappend(
+                    (INFINITY,
+                     stripends(join(letters).strip() + BREAKER)))
+                del(letters[:])
+                previous = LETTER
+            if digits:
+                words.append((numeric(join(digits)), u''))
+                del(digits[:])
+                previous = NUMBER
 
         # Split if we find two pieces of punctuation in a row, even
         # if we should otherwise continue.
-        elif i > 0 and prev_category[0] == "P" and category[0] == "P":
-            broke = True
-            mode = UNKNOWN
-
-        if broke and start is not None and last is not None:
-            # If we read two strings separated by weird punctuation,
-            # pretend the punctuation isn't there.
-            if (this_mode == previous_mode == LETTER
-                and words):
-                words[-1] += BREAKER + string[start:last+1]
-            else:
-                # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
-                # Which sorts after ["foo", "bar"].
-                if this_mode == NUMBER and previous_mode == LETTER and words:
-                    words[-1] += BREAKER
-                words.append(string[start:last+1])
-                previous_mode = this_mode
-
-        if broke:
-            start = i
-            last = None
-        if category[0] in "LN":
-            last = i
-    this_mode = mode
-    if start is not None and last is not None:
-        if this_mode == LETTER and previous_mode == LETTER and words:
-            words[-1] += BREAKER + string[start:last+1]
+        elif i and categories[i - 1][0] == category[0] == "P":
+            if letters:
+                wappend((INFINITY, stripends(join(letters))))
+                del(letters[:])
+                previous = LETTER
+            if digits:
+                words.append((numeric(join(digits)), u''))
+                del(digits[:])
+                previous = NUMBER
+
         else:
-            if this_mode == NUMBER and previous_mode == LETTER and words:
-                words[-1] += BREAKER
-            words.append(string[start:last+1])
-    return words
+            if digits:
+                dappend(uchar)
+            elif letters:
+                lappend(uchar)
+
+        i += 1
+
+    if letters and previous == LETTER and words:
+        word = stripends(words.pop()[1].strip()) + BREAKER
+        letters.insert(0, word)
+        previous = UNKNOWN
+
+    if letters:
+        wappend((INFINITY, stripends(join(letters))))
+    if digits:
+        words.append((numeric(join(digits)), u''))
+
+    return [(i, key(w)) for i, w in words]
+
+def numeric(orig, invalid=INFINITY):
+    """Parse a number out of a string.
+
+    This function parses a unicode number out of the start of a
+    string. If a number cannot be found at the start, the 'invalid'
+    argument is returned.
+        
+    """
 
-def numeric(orig, invalid=float('inf')):
     if not orig:
-        return (invalid, '')
+        return invalid
 
     string = unicode(orig)
-    for c in string:
-        if c.isnumeric():
+    for uchar in string:
+        if uchar.isnumeric():
             break
     else:
-        return (invalid, orig)
+        return invalid
+
+    for char in string:
+        if u"\u2160" <= char <= u"\u2188":
+            return deroman(string)
 
     mult = 1
     while string[:1] == u"-" or string[:1] == u"+":
@@ -120,37 +186,43 @@ def numeric(orig, invalid=float('inf')):
         string = string[1:]
 
     if not string[:1].isnumeric():
-        return (invalid, orig)
+        return invalid
 
-    string = normalize_punc(string)
-
-    # Early out if possible.
-    try:
-        return (float(string) * mult, orig)
-    except ValueError:
-        pass
+    string = normalize_number(string)
 
-    # Otherwise we need to do this the hard way.
     def _numeric(string):
+        """Interpret a number as base 10."""
         total = 0
-        for c in string:
-            v = unicodedata.numeric(c)
-            if v >= 1 or v == 0:
+        for uchar in string:
+            number = unicodedata.numeric(uchar)
+            if number >= 1 or number == 0:
                 total *= 10
-            total += v
+            total += number
         return total
 
     try:
         whole, frac = string.split(".")
         whole = _numeric(whole)
         frac = _numeric(frac) / (10.0 ** len(frac))
-        return (mult * (whole + frac), orig)
+        return mult * (whole + frac)
     except ValueError:
-        return (mult * _numeric(string), orig)
+        return mult * _numeric(string)
+
+def normalize_number(string):
+    """Normalize punctuation in a number.
+
+    This function attempts to guess which characters in a number
+    represent grouping separators and which represent decimal
+    points. It returns a string that is valid to pass to Python's
+    float() routine (possibly "NaN", if nothing like a number is
+    found).
+
+    """
+
+    string = unicode(string)
+    string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string)
+    string = string.strip(KEEP_IN_NUMBERS)
 
-def normalize_punc(string):
-    string = unicode(string.strip(u",.'"))
-    string = filter(lambda u: u.isnumeric() or u in u",.'", string)
     commas = string.count(u",")
     stops = string.count(u".")
     quotes = string.count(u"'")
@@ -167,7 +239,7 @@ def normalize_punc(string):
         quotes = 0
 
     def normalize_two(a, b, string):
-        # One of each - assume the first is grouping, second is point.
+        """One of each - assume the first is grouping, second is point."""
         a_idx = string.rindex(a)
         b_idx = string.rindex(b)
         if a_idx > b_idx:
@@ -228,4 +300,21 @@ def normalize_punc(string):
         # Single stop, but no decimal - probably grouping.
         string = string.replace(u".", u"")
 
-    return string
+    return string or "NaN"
+
+def deroman(string):
+    """Turn a Roman numeral into an integer."""
+    string = unicodedata.normalize('NFKD', unicode(string)).lower()
+    previous = 0
+    building = 0
+    for char in reversed(string):
+        try:
+            value = ROMAN[char]
+        except KeyError:
+            continue
+        if value < previous:
+            building -= value
+        else:
+            building += value
+        previous = value
+    return building