collate/_strings.py

   1 import unicodedata
   2
   3 CONTINUE_ON = frozenset([
   4     "Ll", "Lm", "Lo", "Lt", "Lu",
   5     "Mc", "Me", "Mn",
   6     "Nd", "Nl", "No",
   7     "Po",
   8     "Zs",
   9     ])
  10
  11 UNKNOWN, LETTER, NUMBER = range(3)
  12
  13 def sortemes(string):
  14     """Generate a list of sortemes for the string.
  15
  16     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
  17     sort information. This is larger than a word boundry but smaller
  18     than a sentence boundry; roughly, a sorteme boundry occurs between
  19     letters and numbers, between numbers and numbrs if 'too much'
  20     punctuation exists in between, between lines.
  21
  22     There is no formal specification for sortemes; the goal of this
  23     function is to provide good output for Collator.sortemekey.
  24     """
  25
  26     words = []
  27     if not string:
  28         return words
  29     string = unicode(string)
  30     start = None
  31     last = None
  32     mode = UNKNOWN
  33     previous_mode = UNKNOWN
  34     category = "XX"
  35     for i, c in enumerate(string):
  36         broke = False
  37         prev_category = category
  38         this_mode = mode
  39         category = unicodedata.category(c)
  40
  41         # Split at the first letter following a number or
  42         # non-continuing character.
  43         if category[0] == "L":
  44             if mode != LETTER:
  45                 broke = True
  46                 mode = LETTER
  47
  48         # Split at the first number following a non-number or
  49         # non-continuing character.
  50         elif category[0] == "N":
  51             if mode != NUMBER:
  52                 broke = True
  53                 mode = NUMBER
  54
  55         # Split if we find a non-continuing character ("weird" ones).
  56         elif category not in CONTINUE_ON:
  57             broke = True
  58             mode = UNKNOWN
  59
  60         # Only certain punctuation allowed in numbers.
  61         elif mode == NUMBER and category[0] == "P" and c not in "',._":
  62             broke = True
  63             mode = UNKNOWN
  64
  65         # Split if we find two pieces of punctuation in a row, even
  66         # if we should otherwise continue.
  67         elif i > 0 and prev_category[0] == "P" and category[0] == "P":
  68             broke = True
  69             mode = UNKNOWN
  70
  71         if broke and start is not None and last is not None:
  72             # If we read two strings separated by weird punctuation,
  73             # pretend the punctuation isn't there.
  74             if (this_mode == previous_mode == LETTER
  75                 and prev_category[0] == "P"
  76                 and words):
  77                 words[-1] += u" " + string[start:last+1]
  78             else:
  79                 # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
  80                 # Which sorts after ["foo", "bar"].
  81                 if this_mode == NUMBER and previous_mode == LETTER and words:
  82                     words[-1] += u" "
  83                 words.append(string[start:last+1])
  84                 previous_mode = this_mode
  85
  86         if broke:
  87             start = i
  88             last = None
  89         if category[0] in "LN":
  90             last = i
  91     if start is not None and last is not None:
  92         if this_mode == previous_mode == LETTER and words:
  93             words[-1] += u" " + string[start:last+1]
  94         else:
  95             if this_mode == NUMBER and previous_mode == LETTER and words:
  96                 words[-1] += u" "
  97             words.append(string[start:last+1])
  98     return words
  99
 100 def numeric(orig, invalid=float('inf')):
 101     if not orig:
 102         return (invalid, '')
 103
 104     string = unicode(orig)
 105     for c in string:
 106         if c.isnumeric():
 107             break
 108     else:
 109         return (invalid, orig)
 110
 111     mult = 1
 112     while string[:1] == u"-" or string[:1] == u"+":
 113         if string[:1] == u"-":
 114             mult = -mult
 115         string = string[1:]
 116
 117     if not string[:1].isnumeric():
 118         return (invalid, orig)
 119
 120     string = normalize_punc(string)
 121
 122     # Early out if possible.
 123     try:
 124         return (float(string) * mult, orig)
 125     except ValueError:
 126         pass
 127
 128     # Otherwise we need to do this the hard way.
 129     def _numeric(string):
 130         total = 0
 131         for c in string:
 132             v = unicodedata.numeric(c)
 133             if v >= 1 or v == 0:
 134                 total *= 10
 135             total += v
 136         return total
 137
 138     try:
 139         whole, frac = string.split(".")
 140         whole = _numeric(whole)
 141         frac = _numeric(frac) / (10.0 ** len(frac))
 142         return (mult * (whole + frac), orig)
 143     except ValueError:
 144         return (mult * _numeric(string), orig)
 145
 146 def normalize_punc(string):
 147     string = unicode(string.strip(u",.'"))
 148     string = filter(lambda u: u.isnumeric() or u in u",.'", string)
 149     commas = string.count(u",")
 150     stops = string.count(u".")
 151     quotes = string.count(u"'")
 152
 153     # If anything occurs more than once, it's a separator.
 154     if commas > 1:
 155         string = string.replace(u",", u"")
 156         commas = 0
 157     if stops > 1:
 158         string = string.replace(u".", u"")
 159         stops = 0
 160     if quotes > 1:
 161         string = string.replace(u"'", u"")
 162         quotes = 0
 163
 164     def normalize_two(a, b, string):
 165         # One of each - assume the first is grouping, second is point.
 166         a_idx = string.rindex(a)
 167         b_idx = string.rindex(b)
 168         if a_idx > b_idx:
 169             string = string.replace(b, u"").replace(a, u".")
 170         else:
 171             string = string.replace(a, u"").replace(b, u".")
 172         return string
 173
 174     if commas and stops and quotes:
 175         # If all three, assume the middle is the decimal point.
 176         # A,AAA.BB'CC
 177         # A.AAA,BB'CC
 178         # A,AAA'BB.CC
 179         # A.AAA'BB,CC
 180         # Not really valid, so do whatever we want...
 181         # A'AAA.BB,CC
 182         # A'AAA,BB.CC
 183         comma_idx = string.index(u",")
 184         stops_idx = string.index(u".")
 185         quotes_idx = string.index(u"'")
 186         if (comma_idx < stops_idx < quotes_idx
 187             or quotes_idx < stops_idx < comma_idx):
 188             string = string.replace(u",", u"").replace(u"'", u"")
 189         elif (comma_idx < quotes_idx < stops_idx
 190             or stops_idx < quotes_idx < comma_idx):
 191             string = string.replace(
 192                 u",", u"").replace(
 193                 u".", u"").replace(
 194                 u"'", u".")
 195         else:
 196             string = string.replace(
 197                 u"'", u"").replace(
 198                 u".", u"").replace(
 199                 u",", u".")
 200
 201     elif stops and quotes:
 202         string = normalize_two(u".", u"'", string)
 203
 204     elif commas and quotes:
 205         string = normalize_two(u",", u"'", string)
 206
 207     elif commas and stops:
 208         string = normalize_two(u",", u".", string)
 209
 210     elif commas:
 211         if string[-4:-3] == u"," and len(string) <= 7:
 212             # Single comma as a thousands separator.
 213             string = string.replace(u",", u"")
 214         else:
 215             # Single comma, not thousands - probably a decimal point.
 216             string = string.replace(u",", u".")
 217
 218     elif quotes:
 219         # Single quote, probably MM'SS", equivalent to a decimal point.
 220         string = string.replace(u"'", u".")
 221
 222     elif stops and string[-4:] == ".000":
 223         # Single stop, but no decimal - probably grouping.
 224         string = string.replace(u".", u"")
 225
 226     return string