collate/_strings.py

   1 import unicodedata
   2
   3 CONTINUE_ON = frozenset([
   4     "Ll", "Lm", "Lo", "Lt", "Lu",
   5     "Mc", "Me", "Mn",
   6     "Nd", "Nl", "No",
   7     "Po",
   8     "Zs",
   9     ])
  10
  11 UNKNOWN, LETTER, NUMBER = range(3)
  12
  13 def sortemes(string):
  14     """Generate a list of sortemes for the string.
  15
  16     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
  17     sort information. This is larger than a word boundry but smaller
  18     than a sentence boundry; roughly, a sorteme boundry occurs between
  19     letters and numbers, between numbers and numbrs if 'too much'
  20     punctuation exists in between, between lines.
  21
  22     There is no formal specification for sortemes; the goal of this
  23     function is to provide good output for Collator.sortemekey.
  24     """
  25
  26     words = []
  27     if not string:
  28         return words
  29     string = unicode(string)
  30     start = None
  31     last = None
  32     mode = UNKNOWN
  33     previous_mode = UNKNOWN
  34     category = "XX"
  35     for i, c in enumerate(string):
  36         broke = False
  37         prev_category = category
  38         this_mode = mode
  39         category = unicodedata.category(c)
  40
  41         # Split at the first letter following a number or
  42         # non-continuing character.
  43         if category[0] == "L":
  44             if mode != LETTER:
  45                 broke = True
  46                 mode = LETTER
  47
  48         # Split at the first number following a non-number or
  49         # non-continuing character.
  50         elif category[0] == "N":
  51             if mode != NUMBER:
  52                 broke = True
  53                 mode = NUMBER
  54
  55         # Split if we find a non-continuing character ("weird" ones).
  56         elif category not in CONTINUE_ON:
  57             broke = True
  58             mode = UNKNOWN
  59
  60         # Only certain punctuation allowed in numbers.
  61         elif mode == NUMBER and category[0] == "P" and c not in "',._":
  62             broke = True
  63             mode = UNKNOWN
  64
  65         # Split if we find two pieces of punctuation in a row, even
  66         # if we should otherwise continue.
  67         elif i > 0 and prev_category[0] == "P" and category[0] == "P":
  68             broke = True
  69             mode = UNKNOWN
  70
  71         if broke and start is not None and last is not None:
  72             # If we read two strings separated by weird punctuation,
  73             # pretend the punctuation isn't there.
  74             if (this_mode == previous_mode == LETTER
  75                 and (category[0] == "P" or prev_category[0] == "P")
  76                 and words):
  77                 words[-1] += u" " + string[start:last+1]
  78             else:
  79                 # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
  80                 # Which sorts after ["foo", "bar"].
  81                 if this_mode == NUMBER and previous_mode == LETTER and words:
  82                     words[-1] += u" "
  83                 words.append(string[start:last+1])
  84                 previous_mode = this_mode
  85
  86         if broke:
  87             start = i
  88             last = None
  89         if category[0] in "LN":
  90             last = i
  91     this_mode = mode
  92     if start is not None and last is not None:
  93         if this_mode == LETTER and previous_mode == LETTER and words:
  94             words[-1] += u" " + string[start:last+1]
  95         else:
  96             if this_mode == NUMBER and previous_mode == LETTER and words:
  97                 words[-1] += u" "
  98             words.append(string[start:last+1])
  99     return words
 100
 101 def numeric(orig, invalid=float('inf')):
 102     if not orig:
 103         return (invalid, '')
 104
 105     string = unicode(orig)
 106     for c in string:
 107         if c.isnumeric():
 108             break
 109     else:
 110         return (invalid, orig)
 111
 112     mult = 1
 113     while string[:1] == u"-" or string[:1] == u"+":
 114         if string[:1] == u"-":
 115             mult = -mult
 116         string = string[1:]
 117
 118     if not string[:1].isnumeric():
 119         return (invalid, orig)
 120
 121     string = normalize_punc(string)
 122
 123     # Early out if possible.
 124     try:
 125         return (float(string) * mult, orig)
 126     except ValueError:
 127         pass
 128
 129     # Otherwise we need to do this the hard way.
 130     def _numeric(string):
 131         total = 0
 132         for c in string:
 133             v = unicodedata.numeric(c)
 134             if v >= 1 or v == 0:
 135                 total *= 10
 136             total += v
 137         return total
 138
 139     try:
 140         whole, frac = string.split(".")
 141         whole = _numeric(whole)
 142         frac = _numeric(frac) / (10.0 ** len(frac))
 143         return (mult * (whole + frac), orig)
 144     except ValueError:
 145         return (mult * _numeric(string), orig)
 146
 147 def normalize_punc(string):
 148     string = unicode(string.strip(u",.'"))
 149     string = filter(lambda u: u.isnumeric() or u in u",.'", string)
 150     commas = string.count(u",")
 151     stops = string.count(u".")
 152     quotes = string.count(u"'")
 153
 154     # If anything occurs more than once, it's a separator.
 155     if commas > 1:
 156         string = string.replace(u",", u"")
 157         commas = 0
 158     if stops > 1:
 159         string = string.replace(u".", u"")
 160         stops = 0
 161     if quotes > 1:
 162         string = string.replace(u"'", u"")
 163         quotes = 0
 164
 165     def normalize_two(a, b, string):
 166         # One of each - assume the first is grouping, second is point.
 167         a_idx = string.rindex(a)
 168         b_idx = string.rindex(b)
 169         if a_idx > b_idx:
 170             string = string.replace(b, u"").replace(a, u".")
 171         else:
 172             string = string.replace(a, u"").replace(b, u".")
 173         return string
 174
 175     if commas and stops and quotes:
 176         # If all three, assume the middle is the decimal point.
 177         # A,AAA.BB'CC
 178         # A.AAA,BB'CC
 179         # A,AAA'BB.CC
 180         # A.AAA'BB,CC
 181         # Not really valid, so do whatever we want...
 182         # A'AAA.BB,CC
 183         # A'AAA,BB.CC
 184         comma_idx = string.index(u",")
 185         stops_idx = string.index(u".")
 186         quotes_idx = string.index(u"'")
 187         if (comma_idx < stops_idx < quotes_idx
 188             or quotes_idx < stops_idx < comma_idx):
 189             string = string.replace(u",", u"").replace(u"'", u"")
 190         elif (comma_idx < quotes_idx < stops_idx
 191             or stops_idx < quotes_idx < comma_idx):
 192             string = string.replace(
 193                 u",", u"").replace(
 194                 u".", u"").replace(
 195                 u"'", u".")
 196         else:
 197             string = string.replace(
 198                 u"'", u"").replace(
 199                 u".", u"").replace(
 200                 u",", u".")
 201
 202     elif stops and quotes:
 203         string = normalize_two(u".", u"'", string)
 204
 205     elif commas and quotes:
 206         string = normalize_two(u",", u"'", string)
 207
 208     elif commas and stops:
 209         string = normalize_two(u",", u".", string)
 210
 211     elif commas:
 212         if string[-4:-3] == u"," and len(string) <= 7:
 213             # Single comma as a thousands separator.
 214             string = string.replace(u",", u"")
 215         else:
 216             # Single comma, not thousands - probably a decimal point.
 217             string = string.replace(u",", u".")
 218
 219     elif quotes:
 220         # Single quote, probably MM'SS", equivalent to a decimal point.
 221         string = string.replace(u"'", u".")
 222
 223     elif stops and string[-4:] == ".000":
 224         # Single stop, but no decimal - probably grouping.
 225         string = string.replace(u".", u"")
 226
 227     return string