collate/strings.py

   1 """String utility functions for collation."""
   2
   3 __all__ = ["sortemes", "numeric", "normalize_number", "deroman"]
   4
   5 import unicodedata
   6
   7 CONTINUE_ON = frozenset([
   8     "Ll", "Lm", "Lo", "Lt", "Lu",
   9     "Mc", "Me", "Mn",
  10     "Nd", "Nl", "No",
  11     "Po",
  12     "Zs",
  13     ])
  14
  15 UNKNOWN, LETTER, NUMBER = range(3)
  16
  17 BREAKER = u"\u2028" # Line break character
  18 HBREAKER = u"\u2029" # Paragraph break character
  19 INFINITY = float('inf')
  20
  21 KEEP_IN_NUMBERS = u"'.,"
  22 ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
  23
  24 ROMAN = {
  25     u"i": 1,
  26     u"v": 5,
  27     u"x": 10,
  28     u"l": 50,
  29     u"c": 100,
  30     u"d": 500,
  31     u"m": 1000,
  32     u"\u2180": 1000,
  33     u"\u2181": 5000,
  34     u"\u2182": 10000,
  35     u"\u2183": 100,
  36     u"\u2184": 100,
  37     u"\u2185": 6,
  38     u"\u2186": 50,
  39     u"\u2187": 50000,
  40     u"\u2188": 100000,
  41     }
  42
  43 def stripends(word):
  44     """Strip punctuation and symbols from the ends of a string."""
  45     while word and unicodedata.category(word[0])[0] in "PS":
  46         word = word[1:]
  47     while word and unicodedata.category(word[-1])[0] in "PS":
  48         word = word[:-1]
  49     return word
  50
  51 def sortemes(string, key=lambda s: s):
  52     """Generate a list of sortemes for the string.
  53
  54     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
  55     sort information. This is larger than a word boundry but smaller
  56     than a sentence boundry; roughly, a sorteme boundry occurs between
  57     letters and numbers, between numbers and numbers if 'too much'
  58     punctuation exists in between, between lines.
  59
  60     There is no formal specification for sortemes; the goal of this
  61     function is to provide good output for Collator.sortemekey.
  62
  63     """
  64
  65     if not string:
  66         return []
  67
  68     words = []
  69     letters = []
  70     digits = []
  71     lappend = letters.append
  72     dappend = digits.append
  73     string = unicode(string)
  74     categories = map(unicodedata.category, string)
  75     previous = UNKNOWN
  76     wappend = words.append
  77     join = u"".join
  78     i = 0
  79
  80     for uchar in string:
  81         category = categories[i]
  82
  83         if letters and previous == LETTER and words:
  84             word = stripends(words.pop()[1].strip()) + BREAKER
  85             letters.insert(0, word)
  86             previous = UNKNOWN
  87
  88         # Split at the first letter following a number or
  89         # non-continuing character.
  90         if category[0] == "L":
  91             lappend(uchar)
  92             if digits:
  93                 words.append((numeric(join(digits).strip()), u''))
  94                 del(digits[:])
  95                 previous = NUMBER
  96
  97         # Split at the first number following a non-number or
  98         # non-continuing character.
  99         elif category[0] == "N":
 100             dappend(uchar)
 101             if letters:
 102                 if unicodedata.category(letters[-1])[0] == "L":
 103                     lappend(HBREAKER)
 104                 wappend((INFINITY, stripends(join(letters))))
 105                 del(letters[:])
 106                 previous = LETTER
 107
 108         # Only certain punctuation allowed in numbers.
 109         elif digits and uchar not in ALLOWED_IN_NUMBERS:
 110             words.append((numeric(join(digits)), u''))
 111             del(digits[:])
 112             previous = NUMBER
 113
 114         # Split if we find a non-continuing character ("weird" ones).
 115         elif category not in CONTINUE_ON:
 116             if letters:
 117                 wappend(
 118                     (INFINITY,
 119                      stripends(join(letters).strip() + BREAKER)))
 120                 del(letters[:])
 121                 previous = LETTER
 122             if digits:
 123                 words.append((numeric(join(digits)), u''))
 124                 del(digits[:])
 125                 previous = NUMBER
 126
 127         # Split if we find two pieces of punctuation in a row, even
 128         # if we should otherwise continue.
 129         elif i and categories[i - 1][0] == category[0] == "P":
 130             if letters:
 131                 wappend((INFINITY, stripends(join(letters))))
 132                 del(letters[:])
 133                 previous = LETTER
 134             if digits:
 135                 words.append((numeric(join(digits)), u''))
 136                 del(digits[:])
 137                 previous = NUMBER
 138
 139         else:
 140             if digits:
 141                 dappend(uchar)
 142             elif letters:
 143                 lappend(uchar)
 144
 145         i += 1
 146
 147     if letters and previous == LETTER and words:
 148         word = stripends(words.pop()[1].strip()) + BREAKER
 149         letters.insert(0, word)
 150         previous = UNKNOWN
 151
 152     if letters:
 153         wappend((INFINITY, stripends(join(letters))))
 154     if digits:
 155         words.append((numeric(join(digits)), u''))
 156
 157     return [(i, key(w)) for i, w in words]
 158
 159 def numeric(orig, invalid=INFINITY):
 160     """Parse a number out of a string.
 161
 162     This function parses a unicode number out of the start of a
 163     string. If a number cannot be found at the start, the 'invalid'
 164     argument is returned.
 165
 166     """
 167
 168     if not orig:
 169         return invalid
 170
 171     string = unicode(orig)
 172     for uchar in string:
 173         if uchar.isnumeric():
 174             break
 175     else:
 176         return invalid
 177
 178     for char in string:
 179         if u"\u2160" <= char <= u"\u2188":
 180             return deroman(string)
 181
 182     mult = 1
 183     while string[:1] == u"-" or string[:1] == u"+":
 184         if string[:1] == u"-":
 185             mult = -mult
 186         string = string[1:]
 187
 188     if not string[:1].isnumeric():
 189         return invalid
 190
 191     string = normalize_number(string)
 192
 193     def _numeric(string):
 194         """Interpreter a number as base 10."""
 195         total = 0
 196         for uchar in string:
 197             number = unicodedata.numeric(uchar)
 198             if number >= 1 or number == 0:
 199                 total *= 10
 200             total += number
 201         return total
 202
 203     try:
 204         whole, frac = string.split(".")
 205         whole = _numeric(whole)
 206         frac = _numeric(frac) / (10.0 ** len(frac))
 207         return mult * (whole + frac)
 208     except ValueError:
 209         return mult * _numeric(string)
 210
 211 def normalize_number(string):
 212     """Normalize punctuation in a number.
 213
 214     This function attempts to guess which characters in a number
 215     represent grouping separators and which represent decimal
 216     points. It returns a string that is valid to pass to Python's
 217     float() routine (potentially, NaN, if nothing like a number is
 218     found).
 219
 220     """
 221
 222     string = unicode(string)
 223     string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string)
 224     string = string.strip(KEEP_IN_NUMBERS)
 225
 226     commas = string.count(u",")
 227     stops = string.count(u".")
 228     quotes = string.count(u"'")
 229
 230     # If anything occurs more than once, it's a separator.
 231     if commas > 1:
 232         string = string.replace(u",", u"")
 233         commas = 0
 234     if stops > 1:
 235         string = string.replace(u".", u"")
 236         stops = 0
 237     if quotes > 1:
 238         string = string.replace(u"'", u"")
 239         quotes = 0
 240
 241     def normalize_two(a, b, string):
 242         """One of each - assume the first is grouping, second is point."""
 243         a_idx = string.rindex(a)
 244         b_idx = string.rindex(b)
 245         if a_idx > b_idx:
 246             string = string.replace(b, u"").replace(a, u".")
 247         else:
 248             string = string.replace(a, u"").replace(b, u".")
 249         return string
 250
 251     if commas and stops and quotes:
 252         # If all three, assume the middle is the decimal point.
 253         # A,AAA.BB'CC
 254         # A.AAA,BB'CC
 255         # A,AAA'BB.CC
 256         # A.AAA'BB,CC
 257         # Not really valid, so do whatever we want...
 258         # A'AAA.BB,CC
 259         # A'AAA,BB.CC
 260         comma_idx = string.index(u",")
 261         stops_idx = string.index(u".")
 262         quotes_idx = string.index(u"'")
 263         if (comma_idx < stops_idx < quotes_idx
 264             or quotes_idx < stops_idx < comma_idx):
 265             string = string.replace(u",", u"").replace(u"'", u"")
 266         elif (comma_idx < quotes_idx < stops_idx
 267             or stops_idx < quotes_idx < comma_idx):
 268             string = string.replace(
 269                 u",", u"").replace(
 270                 u".", u"").replace(
 271                 u"'", u".")
 272         else:
 273             string = string.replace(
 274                 u"'", u"").replace(
 275                 u".", u"").replace(
 276                 u",", u".")
 277
 278     elif stops and quotes:
 279         string = normalize_two(u".", u"'", string)
 280
 281     elif commas and quotes:
 282         string = normalize_two(u",", u"'", string)
 283
 284     elif commas and stops:
 285         string = normalize_two(u",", u".", string)
 286
 287     elif commas:
 288         if string[-4:-3] == u"," and len(string) <= 7:
 289             # Single comma as a thousands separator.
 290             string = string.replace(u",", u"")
 291         else:
 292             # Single comma, not thousands - probably a decimal point.
 293             string = string.replace(u",", u".")
 294
 295     elif quotes:
 296         # Single quote, probably MM'SS", equivalent to a decimal point.
 297         string = string.replace(u"'", u".")
 298
 299     elif stops and string[-4:] == ".000":
 300         # Single stop, but no decimal - probably grouping.
 301         string = string.replace(u".", u"")
 302
 303     return string or "NaN"
 304
 305 def deroman(string):
 306     """Turn a Roman numeral into an integer."""
 307     string = unicodedata.normalize('NFKD', unicode(string)).lower()
 308     previous = 0
 309     building = 0
 310     for char in reversed(string):
 311         try:
 312             value = ROMAN[char]
 313         except KeyError:
 314             continue
 315         if value < previous:
 316             building -= value
 317         else:
 318             building += value
 319         previous = value
 320     return building