# collate/strings.py

import unicodedata

   3 CONTINUE_ON = frozenset([
   4     "Ll", "Lm", "Lo", "Lt", "Lu",
   5     "Mc", "Me", "Mn",
   6     "Nd", "Nl", "No",
   7     "Po",
   8     "Zs",
   9     ])
  10
  11 UNKNOWN, LETTER, NUMBER = range(3)
  12
  13 BREAKER = u"\u2029"
  14
  15 def sortemes(string):
  16     """Generate a list of sortemes for the string.
  17
  18     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
  19     sort information. This is larger than a word boundry but smaller
  20     than a sentence boundry; roughly, a sorteme boundry occurs between
  21     letters and numbers, between numbers and numbrs if 'too much'
  22     punctuation exists in between, between lines.
  23
  24     There is no formal specification for sortemes; the goal of this
  25     function is to provide good output for Collator.sortemekey.
  26     """
  27
  28     words = []
  29     if not string:
  30         return words
  31     string = unicode(string)
  32     start = None
  33     last = None
  34     mode = UNKNOWN
  35     previous_mode = UNKNOWN
  36     category = "XX"
  37
  38     # TODO(jfw): This kind of evolved over time, there's probably a much
  39     # faster / more concise way to express it now.
  40     for i, c in enumerate(string):
  41         broke = False
  42         prev_category = category
  43         this_mode = mode
  44         category = unicodedata.category(c)
  45
  46         # Split at the first letter following a number or
  47         # non-continuing character.
  48         if category[0] == "L":
  49             if mode != LETTER:
  50                 broke = True
  51                 mode = LETTER
  52
  53         # Split at the first number following a non-number or
  54         # non-continuing character.
  55         elif category[0] == "N":
  56             if mode != NUMBER:
  57                 broke = True
  58                 mode = NUMBER
  59
  60         # Split if we find a non-continuing character ("weird" ones).
  61         elif category not in CONTINUE_ON:
  62             broke = True
  63             mode = UNKNOWN
  64
  65         # Only certain punctuation allowed in numbers.
  66         elif mode == NUMBER and category[0] == "P" and c not in "',._":
  67             broke = True
  68             mode = UNKNOWN
  69
  70         # Split if we find two pieces of punctuation in a row, even
  71         # if we should otherwise continue.
  72         elif i > 0 and prev_category[0] == "P" and category[0] == "P":
  73             broke = True
  74             mode = UNKNOWN
  75
  76         if broke and start is not None and last is not None:
  77             # If we read two strings separated by weird punctuation,
  78             # pretend the punctuation isn't there.
  79             if (this_mode == previous_mode == LETTER
  80                 and words):
  81                 words[-1] += BREAKER + string[start:last+1]
  82             else:
  83                 # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
  84                 # Which sorts after ["foo", "bar"].
  85                 if this_mode == NUMBER and previous_mode == LETTER and words:
  86                     words[-1] += BREAKER
  87                 words.append(string[start:last+1])
  88                 previous_mode = this_mode
  89
  90         if broke:
  91             start = i
  92             last = None
  93         if category[0] in "LN":
  94             last = i
  95     this_mode = mode
  96     if start is not None and last is not None:
  97         if this_mode == LETTER and previous_mode == LETTER and words:
  98             words[-1] += BREAKER + string[start:last+1]
  99         else:
 100             if this_mode == NUMBER and previous_mode == LETTER and words:
 101                 words[-1] += BREAKER
 102             words.append(string[start:last+1])
 103     return words
 104
 105 def numeric(orig, invalid=float('inf')):
 106     if not orig:
 107         return (invalid, '')
 108
 109     string = unicode(orig)
 110     for c in string:
 111         if c.isnumeric():
 112             break
 113     else:
 114         return (invalid, orig)
 115
 116     mult = 1
 117     while string[:1] == u"-" or string[:1] == u"+":
 118         if string[:1] == u"-":
 119             mult = -mult
 120         string = string[1:]
 121
 122     if not string[:1].isnumeric():
 123         return (invalid, orig)
 124
 125     string = normalize_punc(string)
 126
 127     # Early out if possible.
 128     try:
 129         return (float(string) * mult, orig)
 130     except ValueError:
 131         pass
 132
 133     # Otherwise we need to do this the hard way.
 134     def _numeric(string):
 135         total = 0
 136         for c in string:
 137             v = unicodedata.numeric(c)
 138             if v >= 1 or v == 0:
 139                 total *= 10
 140             total += v
 141         return total
 142
 143     try:
 144         whole, frac = string.split(".")
 145         whole = _numeric(whole)
 146         frac = _numeric(frac) / (10.0 ** len(frac))
 147         return (mult * (whole + frac), orig)
 148     except ValueError:
 149         return (mult * _numeric(string), orig)
 150
 151 def normalize_punc(string):
 152     string = unicode(string.strip(u",.'"))
 153     string = filter(lambda u: u.isnumeric() or u in u",.'", string)
 154     commas = string.count(u",")
 155     stops = string.count(u".")
 156     quotes = string.count(u"'")
 157
 158     # If anything occurs more than once, it's a separator.
 159     if commas > 1:
 160         string = string.replace(u",", u"")
 161         commas = 0
 162     if stops > 1:
 163         string = string.replace(u".", u"")
 164         stops = 0
 165     if quotes > 1:
 166         string = string.replace(u"'", u"")
 167         quotes = 0
 168
 169     def normalize_two(a, b, string):
 170         # One of each - assume the first is grouping, second is point.
 171         a_idx = string.rindex(a)
 172         b_idx = string.rindex(b)
 173         if a_idx > b_idx:
 174             string = string.replace(b, u"").replace(a, u".")
 175         else:
 176             string = string.replace(a, u"").replace(b, u".")
 177         return string
 178
 179     if commas and stops and quotes:
 180         # If all three, assume the middle is the decimal point.
 181         # A,AAA.BB'CC
 182         # A.AAA,BB'CC
 183         # A,AAA'BB.CC
 184         # A.AAA'BB,CC
 185         # Not really valid, so do whatever we want...
 186         # A'AAA.BB,CC
 187         # A'AAA,BB.CC
 188         comma_idx = string.index(u",")
 189         stops_idx = string.index(u".")
 190         quotes_idx = string.index(u"'")
 191         if (comma_idx < stops_idx < quotes_idx
 192             or quotes_idx < stops_idx < comma_idx):
 193             string = string.replace(u",", u"").replace(u"'", u"")
 194         elif (comma_idx < quotes_idx < stops_idx
 195             or stops_idx < quotes_idx < comma_idx):
 196             string = string.replace(
 197                 u",", u"").replace(
 198                 u".", u"").replace(
 199                 u"'", u".")
 200         else:
 201             string = string.replace(
 202                 u"'", u"").replace(
 203                 u".", u"").replace(
 204                 u",", u".")
 205
 206     elif stops and quotes:
 207         string = normalize_two(u".", u"'", string)
 208
 209     elif commas and quotes:
 210         string = normalize_two(u",", u"'", string)
 211
 212     elif commas and stops:
 213         string = normalize_two(u",", u".", string)
 214
 215     elif commas:
 216         if string[-4:-3] == u"," and len(string) <= 7:
 217             # Single comma as a thousands separator.
 218             string = string.replace(u",", u"")
 219         else:
 220             # Single comma, not thousands - probably a decimal point.
 221             string = string.replace(u",", u".")
 222
 223     elif quotes:
 224         # Single quote, probably MM'SS", equivalent to a decimal point.
 225         string = string.replace(u"'", u".")
 226
 227     elif stops and string[-4:] == ".000":
 228         # Single stop, but no decimal - probably grouping.
 229         string = string.replace(u".", u"")
 230
 231     return string