collate/strings.py

   1 """String utility functions for collation."""
   2
   3 __all__ = ["sortemes", "numeric", "normalize_number", "deroman"]
   4
   5 import unicodedata
   6
   7 CONTINUE_ON = frozenset([
   8     "Ll", "Lm", "Lo", "Lt", "Lu",
   9     "Mc", "Me", "Mn",
  10     "Nd", "Nl", "No",
  11     "Po",
  12     "Zs",
  13     ])
  14
  15 UNKNOWN, LETTER, NUMBER = range(3)
  16
  17 BREAKER = u"\u2028" # Line break character
  18 HBREAKER = u"\u2029" # Paragraph break character
  19 INFINITY = float('inf')
  20
  21 KEEP_IN_NUMBERS = u"'.,"
  22 ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
  23
  24 ROMAN = {
  25     u"i": 1,
  26     u"v": 5,
  27     u"x": 10,
  28     u"l": 50,
  29     u"c": 100,
  30     u"d": 500,
  31     u"m": 1000,
  32     u"\u2180": 1000,
  33     u"\u2181": 5000,
  34     u"\u2182": 10000,
  35     u"\u2183": 100,
  36     u"\u2184": 100,
  37     u"\u2185": 6,
  38     u"\u2186": 50,
  39     u"\u2187": 50000,
  40     u"\u2188": 100000,
  41     }
  42
  43 INITIAL_STOPS = frozenset([u"a", u"an", u"the"])
  44
  45 def stripends(word):
  46     """Strip punctuation and symbols from the ends of a string."""
  47     while word and unicodedata.category(word[0])[0] in "PS":
  48         word = word[1:]
  49     while word and unicodedata.category(word[-1])[0] in "PS":
  50         word = word[:-1]
  51     return word
  52
  53 def sortemes(string, key=lambda s: s):
  54     """Generate a list of sortemes for the string.
  55
  56     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
  57     sort information. This is larger than a word boundry but smaller
  58     than a sentence boundry; roughly, a sorteme boundry occurs between
  59     letters and numbers, between numbers and numbers if 'too much'
  60     punctuation exists in between, between lines.
  61
  62     There is no formal specification for sortemes; the goal of this
  63     function is to provide good output for Collator.sortemekey.
  64
  65     """
  66
  67     if not string:
  68         return []
  69
  70     words = []
  71     letters = []
  72     digits = []
  73     lappend = letters.append
  74     dappend = digits.append
  75     string = unicode(string)
  76     categories = map(unicodedata.category, string)
  77     previous = UNKNOWN
  78     wappend = words.append
  79     join = u"".join
  80     i = 0
  81
  82     for uchar in string:
  83         category = categories[i]
  84
  85         if letters and previous == LETTER and words:
  86             word = stripends(words.pop()[1].strip()) + BREAKER
  87             letters.insert(0, word)
  88             previous = UNKNOWN
  89
  90         # Split at the first letter following a number or
  91         # non-continuing character.
  92         if category[0] == "L":
  93             lappend(uchar)
  94             if digits:
  95                 words.append((numeric(join(digits).strip()), u''))
  96                 del(digits[:])
  97                 previous = NUMBER
  98
  99         # Split at the first number following a non-number or
 100         # non-continuing character.
 101         elif category[0] == "N":
 102             dappend(uchar)
 103             if letters:
 104                 if unicodedata.category(letters[-1])[0] == "L":
 105                     lappend(HBREAKER)
 106                 wappend((INFINITY, stripends(join(letters))))
 107                 del(letters[:])
 108                 previous = LETTER
 109
 110         # Only certain punctuation allowed in numbers.
 111         elif digits and uchar not in ALLOWED_IN_NUMBERS:
 112             words.append((numeric(join(digits)), u''))
 113             del(digits[:])
 114             previous = NUMBER
 115
 116         # Split if we find a non-continuing character ("weird" ones).
 117         elif category not in CONTINUE_ON:
 118             if letters:
 119                 wappend(
 120                     (INFINITY,
 121                      stripends(join(letters).strip() + BREAKER)))
 122                 del(letters[:])
 123                 previous = LETTER
 124             if digits:
 125                 words.append((numeric(join(digits)), u''))
 126                 del(digits[:])
 127                 previous = NUMBER
 128
 129         # Split if we find two pieces of punctuation in a row, even
 130         # if we should otherwise continue.
 131         elif i and categories[i - 1][0] == category[0] == "P":
 132             if letters:
 133                 wappend((INFINITY, stripends(join(letters))))
 134                 del(letters[:])
 135                 previous = LETTER
 136             if digits:
 137                 words.append((numeric(join(digits)), u''))
 138                 del(digits[:])
 139                 previous = NUMBER
 140
 141         else:
 142             if digits:
 143                 dappend(uchar)
 144             elif letters:
 145                 lappend(uchar)
 146
 147         i += 1
 148
 149     if letters and previous == LETTER and words:
 150         word = stripends(words.pop()[1].strip()) + BREAKER
 151         letters.insert(0, word)
 152         previous = UNKNOWN
 153
 154     if letters:
 155         wappend((INFINITY, stripends(join(letters))))
 156     if digits:
 157         words.append((numeric(join(digits)), u''))
 158
 159     return [(i, key(w)) for i, w in words]
 160
 161 def numeric(orig, invalid=INFINITY):
 162     """Parse a number out of a string.
 163
 164     This function parses a unicode number out of the start of a
 165     string. If a number cannot be found at the start, the 'invalid'
 166     argument is returned.
 167
 168     """
 169
 170     if not orig:
 171         return invalid
 172
 173     string = unicode(orig)
 174     for uchar in string:
 175         if uchar.isnumeric():
 176             break
 177     else:
 178         return invalid
 179
 180     for char in string:
 181         if u"\u2160" <= char <= u"\u2188":
 182             return deroman(string)
 183
 184     mult = 1
 185     while string[:1] == u"-" or string[:1] == u"+":
 186         if string[:1] == u"-":
 187             mult = -mult
 188         string = string[1:]
 189
 190     if not string[:1].isnumeric():
 191         return invalid
 192
 193     string = normalize_number(string)
 194
 195     def _numeric(string):
 196         """Interpreter a number as base 10."""
 197         total = 0
 198         for uchar in string:
 199             number = unicodedata.numeric(uchar)
 200             if number >= 1 or number == 0:
 201                 total *= 10
 202             total += number
 203         return total
 204
 205     try:
 206         whole, frac = string.split(".")
 207         whole = _numeric(whole)
 208         frac = _numeric(frac) / (10.0 ** len(frac))
 209         return mult * (whole + frac)
 210     except ValueError:
 211         return mult * _numeric(string)
 212
 213 def normalize_number(string):
 214     """Normalize punctuation in a number.
 215
 216     This function attempts to guess which characters in a number
 217     represent grouping separators and which represent decimal
 218     points. It returns a string that is valid to pass to Python's
 219     float() routine (potentially, NaN, if nothing like a number is
 220     found).
 221
 222     """
 223
 224     string = unicode(string)
 225     string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string)
 226     string = string.strip(KEEP_IN_NUMBERS)
 227
 228     commas = string.count(u",")
 229     stops = string.count(u".")
 230     quotes = string.count(u"'")
 231
 232     # If anything occurs more than once, it's a separator.
 233     if commas > 1:
 234         string = string.replace(u",", u"")
 235         commas = 0
 236     if stops > 1:
 237         string = string.replace(u".", u"")
 238         stops = 0
 239     if quotes > 1:
 240         string = string.replace(u"'", u"")
 241         quotes = 0
 242
 243     def normalize_two(a, b, string):
 244         """One of each - assume the first is grouping, second is point."""
 245         a_idx = string.rindex(a)
 246         b_idx = string.rindex(b)
 247         if a_idx > b_idx:
 248             string = string.replace(b, u"").replace(a, u".")
 249         else:
 250             string = string.replace(a, u"").replace(b, u".")
 251         return string
 252
 253     if commas and stops and quotes:
 254         # If all three, assume the middle is the decimal point.
 255         # A,AAA.BB'CC
 256         # A.AAA,BB'CC
 257         # A,AAA'BB.CC
 258         # A.AAA'BB,CC
 259         # Not really valid, so do whatever we want...
 260         # A'AAA.BB,CC
 261         # A'AAA,BB.CC
 262         comma_idx = string.index(u",")
 263         stops_idx = string.index(u".")
 264         quotes_idx = string.index(u"'")
 265         if (comma_idx < stops_idx < quotes_idx
 266             or quotes_idx < stops_idx < comma_idx):
 267             string = string.replace(u",", u"").replace(u"'", u"")
 268         elif (comma_idx < quotes_idx < stops_idx
 269             or stops_idx < quotes_idx < comma_idx):
 270             string = string.replace(
 271                 u",", u"").replace(
 272                 u".", u"").replace(
 273                 u"'", u".")
 274         else:
 275             string = string.replace(
 276                 u"'", u"").replace(
 277                 u".", u"").replace(
 278                 u",", u".")
 279
 280     elif stops and quotes:
 281         string = normalize_two(u".", u"'", string)
 282
 283     elif commas and quotes:
 284         string = normalize_two(u",", u"'", string)
 285
 286     elif commas and stops:
 287         string = normalize_two(u",", u".", string)
 288
 289     elif commas:
 290         if string[-4:-3] == u"," and len(string) <= 7:
 291             # Single comma as a thousands separator.
 292             string = string.replace(u",", u"")
 293         else:
 294             # Single comma, not thousands - probably a decimal point.
 295             string = string.replace(u",", u".")
 296
 297     elif quotes:
 298         # Single quote, probably MM'SS", equivalent to a decimal point.
 299         string = string.replace(u"'", u".")
 300
 301     elif stops and string[-4:] == ".000":
 302         # Single stop, but no decimal - probably grouping.
 303         string = string.replace(u".", u"")
 304
 305     return string or "NaN"
 306
 307 def deroman(string):
 308     """Turn a Roman numeral into an integer."""
 309     string = unicodedata.normalize('NFKD', unicode(string)).lower()
 310     previous = 0
 311     building = 0
 312     for char in reversed(string):
 313         try:
 314             value = ROMAN[char]
 315         except KeyError:
 316             continue
 317         if value < previous:
 318             building -= value
 319         else:
 320             building += value
 321         previous = value
 322     return building