collate/strings.py

   1 """String utility functions for collation."""
   2
# Names exported by a star-import of this module. stripends() and
# deroman() are defined in this file but are not listed here.
__all__ = ["sortemes", "numeric", "normalize_number"]
   4
   5 import unicodedata
   6
# Unicode general categories that do not, by themselves, end a run of
# letters in sortemes(): letters (L*), marks (M*), numbers (N*),
# "other" punctuation (Po) and space separators (Zs). A character in
# any other category closes the current letter run.
CONTINUE_ON = frozenset([
    "Ll", "Lm", "Lo", "Lt", "Lu",
    "Mc", "Me", "Mn",
    "Nd", "Nl", "No",
    "Po",
    "Zs",
    ])

# Values of the `previous` state variable in sortemes(): which kind of
# group was most recently flushed to the output list.
UNKNOWN, LETTER, NUMBER = range(3)

# Markers sortemes() places inside the text of a letter group.
BREAKER = u"\u2028" # U+2028 LINE SEPARATOR; appended where a letter run was split
HBREAKER = u"\u2029" # U+2029 PARAGRAPH SEPARATOR; appended when a digit directly follows a letter
# Number slot used by sortemes() for letter groups, and the default
# "no number found" result of numeric().
INFINITY = float('inf')

# Punctuation that normalize_number() keeps (alongside numeric
# characters) when filtering its input.
KEEP_IN_NUMBERS = u"'.,"
# Characters that do not terminate a run of digits in sortemes().
ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"

# Value of each Roman numeral symbol, looked up by deroman(). Keys are
# lower case because deroman() lower-cases its input before the lookup.
ROMAN = {
    u"i": 1,
    u"v": 5,
    u"x": 10,
    u"l": 50,
    u"c": 100,
    u"d": 500,
    u"m": 1000,
    u"\u2180": 1000,   # ROMAN NUMERAL ONE THOUSAND C D
    u"\u2181": 5000,   # ROMAN NUMERAL FIVE THOUSAND
    u"\u2182": 10000,  # ROMAN NUMERAL TEN THOUSAND
    u"\u2183": 100,    # ROMAN NUMERAL REVERSED ONE HUNDRED
    u"\u2184": 100,    # LATIN SMALL LETTER REVERSED C (lower case of U+2183)
    u"\u2185": 6,      # ROMAN NUMERAL SIX LATE FORM
    u"\u2186": 50,     # ROMAN NUMERAL FIFTY EARLY FORM
    u"\u2187": 50000,  # ROMAN NUMERAL FIFTY THOUSAND
    u"\u2188": 100000, # ROMAN NUMERAL ONE HUNDRED THOUSAND
    }
  42
  43 def stripends(word):
  44     """Strip punctuation and symbols from the ends of a string."""
  45     while word and unicodedata.category(word[0])[0] in "PS":
  46         word = word[1:]
  47     while word and unicodedata.category(word[-1])[0] in "PS":
  48         word = word[:-1]
  49     return word
  50
def sortemes(string, key=lambda s: s):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, and between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    The result is a list of 2-tuples. A group of letters is returned
    as (INFINITY, key(text)), or (INFINITY, u'') when the text is empty
    after stripping; a group of digits is returned as
    (numeric(digits), u''). A false 'string' gives an empty list.

    """

    words = []    # finished (number, text) groups, in input order
    letters = []  # pieces of the letter run currently being collected
    digits = []   # characters of the digit run currently being collected
    if not string:
        return words
    string = unicode(string)
    # One general category per character. This is indexed further down
    # (categories[i-1]), so it has to be a real sequence such as the
    # list that Python 2's map() returns.
    categories = map(unicodedata.category, string)
    # Kind of group most recently flushed into 'words'.
    previous = UNKNOWN

    def aletters(letters):
        """Add a group of letters to the word list."""
        words.append((INFINITY, stripends(letters)))
    def adigits(digits):
        """Add a group of digits to the word list."""
        words.append((numeric(digits), u''))

    # TODO(jfw): This kind of evolved over time, there's probably a much
    # faster / more concise way to express it now.
    for i, (uchar, category) in enumerate(zip(string, categories)):

        # The last group flushed was letters and a new letter run has
        # started: take that group back out of 'words' and put its
        # text (stripped, with BREAKER appended) at the front of the
        # pending run, so both end up in a single entry.
        if letters and previous == LETTER and words:
            word = stripends(words.pop()[1].strip()) + BREAKER
            letters.insert(0, word)
            previous = UNKNOWN

        # Split at the first letter following a number or
        # non-continuing character.
        if category[0] == "L":
            letters.append(uchar)
            if digits:
                adigits(u"".join(digits).strip())
                digits = []
                previous = NUMBER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif category[0] == "N":
            digits.append(uchar)
            if letters:
                # A digit directly after a letter (nothing in between)
                # is marked with HBREAKER before the letters are flushed.
                if unicodedata.category(letters[-1])[0] == "L":
                    letters.append(HBREAKER)
                aletters(u"".join(letters))
                letters = []
                previous = LETTER

        # Only certain punctuation allowed in numbers.
        elif digits and uchar not in ALLOWED_IN_NUMBERS:
            adigits(u"".join(digits))
            digits = []
            previous = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        elif letters and category not in CONTINUE_ON:
            # NOTE(review): 'letters' is already known to be non-empty
            # here because of the elif condition above.
            if letters:
                aletters(u"".join(letters).strip() + BREAKER)
                letters = []
                previous = LETTER
            if digits:
                adigits(u"".join(digits).strip())
                digits = []
                previous = NUMBER

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i and categories[i-1][0] in "P" and category[0] in "P":
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER
            if digits:
                adigits(u"".join(digits))
                digits = []
                previous = NUMBER

        # Any other character is kept inside whichever run is open;
        # when no run is open it is dropped.
        else:
            if digits:
                digits.append(uchar)
            elif letters:
                letters.append(uchar)

    # Same merge step as at the top of the loop, for a letter run that
    # is still pending when the input ends.
    if letters and previous == LETTER and words:
        word = stripends(words.pop()[1].strip()) + BREAKER
        letters.insert(0, word)
        previous = UNKNOWN

    # Flush whatever run is still open.
    if letters:
        aletters(u"".join(letters))
    if digits:
        adigits(u"".join(digits))

    # Apply 'key' to non-empty text only; empty text stays u''.
    return [(i, key(w) if w else u'') for i, w in words]
 156
 157 def numeric(orig, invalid=INFINITY):
 158     """Parse a number out of a string.
 159
 160     This function parses a unicode number out of the start of a
 161     string. If a number cannot be found at the start, the 'invalid'
 162     argument is returned.
 163
 164     """
 165
 166     if not orig:
 167         return invalid
 168
 169     string = unicode(orig)
 170     for uchar in string:
 171         if uchar.isnumeric():
 172             break
 173     else:
 174         return invalid
 175
 176     for char in string:
 177         if u"\u2160" <= char <= u"\u2188":
 178             return deroman(string)
 179
 180     mult = 1
 181     while string[:1] == u"-" or string[:1] == u"+":
 182         if string[:1] == u"-":
 183             mult = -mult
 184         string = string[1:]
 185
 186     if not string[:1].isnumeric():
 187         return invalid
 188
 189     string = normalize_number(string)
 190
 191     def _numeric(string):
 192         """Interpreter a number as base 10."""
 193         total = 0
 194         for uchar in string:
 195             number = unicodedata.numeric(uchar)
 196             if number >= 1 or number == 0:
 197                 total *= 10
 198             total += number
 199         return total
 200
 201     try:
 202         whole, frac = string.split(".")
 203         whole = _numeric(whole)
 204         frac = _numeric(frac) / (10.0 ** len(frac))
 205         return mult * (whole + frac)
 206     except ValueError:
 207         return mult * _numeric(string)
 208
 209 def normalize_number(string):
 210     """Normalize punctuation in a number.
 211
 212     This function attempts to guess which characters in a number
 213     represent grouping separators and which represent decimal
 214     points. It returns a string that is valid to pass to Python's
 215     float() routine (potentially, NaN, if nothing like a number is
 216     found).
 217
 218     """
 219
 220     string = unicode(string)
 221     string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string)
 222     string = string.strip(KEEP_IN_NUMBERS)
 223
 224     commas = string.count(u",")
 225     stops = string.count(u".")
 226     quotes = string.count(u"'")
 227
 228     # If anything occurs more than once, it's a separator.
 229     if commas > 1:
 230         string = string.replace(u",", u"")
 231         commas = 0
 232     if stops > 1:
 233         string = string.replace(u".", u"")
 234         stops = 0
 235     if quotes > 1:
 236         string = string.replace(u"'", u"")
 237         quotes = 0
 238
 239     def normalize_two(a, b, string):
 240         """One of each - assume the first is grouping, second is point."""
 241         a_idx = string.rindex(a)
 242         b_idx = string.rindex(b)
 243         if a_idx > b_idx:
 244             string = string.replace(b, u"").replace(a, u".")
 245         else:
 246             string = string.replace(a, u"").replace(b, u".")
 247         return string
 248
 249     if commas and stops and quotes:
 250         # If all three, assume the middle is the decimal point.
 251         # A,AAA.BB'CC
 252         # A.AAA,BB'CC
 253         # A,AAA'BB.CC
 254         # A.AAA'BB,CC
 255         # Not really valid, so do whatever we want...
 256         # A'AAA.BB,CC
 257         # A'AAA,BB.CC
 258         comma_idx = string.index(u",")
 259         stops_idx = string.index(u".")
 260         quotes_idx = string.index(u"'")
 261         if (comma_idx < stops_idx < quotes_idx
 262             or quotes_idx < stops_idx < comma_idx):
 263             string = string.replace(u",", u"").replace(u"'", u"")
 264         elif (comma_idx < quotes_idx < stops_idx
 265             or stops_idx < quotes_idx < comma_idx):
 266             string = string.replace(
 267                 u",", u"").replace(
 268                 u".", u"").replace(
 269                 u"'", u".")
 270         else:
 271             string = string.replace(
 272                 u"'", u"").replace(
 273                 u".", u"").replace(
 274                 u",", u".")
 275
 276     elif stops and quotes:
 277         string = normalize_two(u".", u"'", string)
 278
 279     elif commas and quotes:
 280         string = normalize_two(u",", u"'", string)
 281
 282     elif commas and stops:
 283         string = normalize_two(u",", u".", string)
 284
 285     elif commas:
 286         if string[-4:-3] == u"," and len(string) <= 7:
 287             # Single comma as a thousands separator.
 288             string = string.replace(u",", u"")
 289         else:
 290             # Single comma, not thousands - probably a decimal point.
 291             string = string.replace(u",", u".")
 292
 293     elif quotes:
 294         # Single quote, probably MM'SS", equivalent to a decimal point.
 295         string = string.replace(u"'", u".")
 296
 297     elif stops and string[-4:] == ".000":
 298         # Single stop, but no decimal - probably grouping.
 299         string = string.replace(u".", u"")
 300
 301     return string or "NaN"
 302
 303 def deroman(string):
 304     """Turn a Roman numeral into an integer."""
 305     string = unicodedata.normalize('NFKD', unicode(string)).lower()
 306     previous = 0
 307     building = 0
 308     for char in reversed(string):
 309         try:
 310             value = ROMAN[char]
 311         except KeyError:
 312             continue
 313         if value < previous:
 314             building -= value
 315         else:
 316             building += value
 317         previous = value
 318     return building