collate/strings.py

   1 import unicodedata
   2
   3 CONTINUE_ON = frozenset([
   4     "Ll", "Lm", "Lo", "Lt", "Lu",
   5     "Mc", "Me", "Mn",
   6     "Nd", "Nl", "No",
   7     "Po",
   8     "Zs",
   9     ])
  10
  11 UNKNOWN, LETTER, NUMBER = range(3)
  12
  13 BREAKER = u"\u2029"
  14
  15 def sortemes(string):
  16     """Generate a list of sortemes for the string.
  17
  18     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
  19     sort information. This is larger than a word boundry but smaller
  20     than a sentence boundry; roughly, a sorteme boundry occurs between
  21     letters and numbers, between numbers and numbrs if 'too much'
  22     punctuation exists in between, between lines.
  23
  24     There is no formal specification for sortemes; the goal of this
  25     function is to provide good output for Collator.sortemekey.
  26     """
  27
  28     words = []
  29     if not string:
  30         return words
  31     string = unicode(string)
  32     start = None
  33     last = None
  34     mode = UNKNOWN
  35     previous_mode = UNKNOWN
  36     category = "XX"
  37
  38     # TODO(jfw): This kind of evolved over time, there's probably a much
  39     # faster / more concise way to express it now.
  40     for i, c in enumerate(string):
  41         broke = False
  42         prev_category = category
  43         this_mode = mode
  44         category = unicodedata.category(c)
  45
  46         # Split at the first letter following a number or
  47         # non-continuing character.
  48         if category[0] == "L":
  49             if mode != LETTER:
  50                 broke = True
  51                 mode = LETTER
  52
  53         # Split at the first number following a non-number or
  54         # non-continuing character.
  55         elif category[0] == "N":
  56             if mode != NUMBER:
  57                 broke = True
  58                 mode = NUMBER
  59
  60         # Split if we find a non-continuing character ("weird" ones).
  61         elif category not in CONTINUE_ON:
  62             broke = True
  63             mode = UNKNOWN
  64
  65         # Only certain punctuation allowed in numbers.
  66         elif mode == NUMBER and category[0] == "P" and c not in "',._":
  67             broke = True
  68             mode = UNKNOWN
  69
  70         # Split if we find two pieces of punctuation in a row, even
  71         # if we should otherwise continue.
  72         elif prev_category[0] in "P" and category[0] in "P":
  73             broke = True
  74             mode = UNKNOWN
  75
  76         if broke and start is not None and last is not None:
  77             # If we read two strings separated by weird punctuation,
  78             # pretend the punctuation isn't there.
  79             if this_mode == previous_mode == LETTER:
  80                 words[-1] += BREAKER + string[start:last+1]
  81             else:
  82                 if this_mode == NUMBER and previous_mode == LETTER:
  83                     words[-1] += BREAKER
  84                 words.append(string[start:last+1])
  85             previous_mode = this_mode
  86
  87         if broke:
  88             start = i
  89             last = None
  90         if category[0] in "LN":
  91             last = i
  92     this_mode = mode
  93     if start is not None and last is not None:
  94         if this_mode == LETTER and previous_mode == LETTER and words:
  95             words[-1] += BREAKER + string[start:last+1]
  96         else:
  97             if this_mode == NUMBER and previous_mode == LETTER and words:
  98                 words[-1] += BREAKER
  99             words.append(string[start:last+1])
 100     return words
 101
 102 def numeric(orig, invalid=float('inf')):
 103     if not orig:
 104         return (invalid, '')
 105
 106     string = unicode(orig)
 107     for c in string:
 108         if c.isnumeric():
 109             break
 110     else:
 111         return (invalid, orig)
 112
 113     mult = 1
 114     while string[:1] == u"-" or string[:1] == u"+":
 115         if string[:1] == u"-":
 116             mult = -mult
 117         string = string[1:]
 118
 119     if not string[:1].isnumeric():
 120         return (invalid, orig)
 121
 122     string = normalize_punc(string)
 123
 124     # Early out if possible.
 125     try:
 126         return (float(string) * mult, orig)
 127     except ValueError:
 128         pass
 129
 130     # Otherwise we need to do this the hard way.
 131     def _numeric(string):
 132         total = 0
 133         for c in string:
 134             v = unicodedata.numeric(c)
 135             if v >= 1 or v == 0:
 136                 total *= 10
 137             total += v
 138         return total
 139
 140     try:
 141         whole, frac = string.split(".")
 142         whole = _numeric(whole)
 143         frac = _numeric(frac) / (10.0 ** len(frac))
 144         return (mult * (whole + frac), orig)
 145     except ValueError:
 146         return (mult * _numeric(string), orig)
 147
 148 def normalize_punc(string):
 149     string = unicode(string.strip(u",.'"))
 150     string = filter(lambda u: u.isnumeric() or u in u",.'", string)
 151     commas = string.count(u",")
 152     stops = string.count(u".")
 153     quotes = string.count(u"'")
 154
 155     # If anything occurs more than once, it's a separator.
 156     if commas > 1:
 157         string = string.replace(u",", u"")
 158         commas = 0
 159     if stops > 1:
 160         string = string.replace(u".", u"")
 161         stops = 0
 162     if quotes > 1:
 163         string = string.replace(u"'", u"")
 164         quotes = 0
 165
 166     def normalize_two(a, b, string):
 167         # One of each - assume the first is grouping, second is point.
 168         a_idx = string.rindex(a)
 169         b_idx = string.rindex(b)
 170         if a_idx > b_idx:
 171             string = string.replace(b, u"").replace(a, u".")
 172         else:
 173             string = string.replace(a, u"").replace(b, u".")
 174         return string
 175
 176     if commas and stops and quotes:
 177         # If all three, assume the middle is the decimal point.
 178         # A,AAA.BB'CC
 179         # A.AAA,BB'CC
 180         # A,AAA'BB.CC
 181         # A.AAA'BB,CC
 182         # Not really valid, so do whatever we want...
 183         # A'AAA.BB,CC
 184         # A'AAA,BB.CC
 185         comma_idx = string.index(u",")
 186         stops_idx = string.index(u".")
 187         quotes_idx = string.index(u"'")
 188         if (comma_idx < stops_idx < quotes_idx
 189             or quotes_idx < stops_idx < comma_idx):
 190             string = string.replace(u",", u"").replace(u"'", u"")
 191         elif (comma_idx < quotes_idx < stops_idx
 192             or stops_idx < quotes_idx < comma_idx):
 193             string = string.replace(
 194                 u",", u"").replace(
 195                 u".", u"").replace(
 196                 u"'", u".")
 197         else:
 198             string = string.replace(
 199                 u"'", u"").replace(
 200                 u".", u"").replace(
 201                 u",", u".")
 202
 203     elif stops and quotes:
 204         string = normalize_two(u".", u"'", string)
 205
 206     elif commas and quotes:
 207         string = normalize_two(u",", u"'", string)
 208
 209     elif commas and stops:
 210         string = normalize_two(u",", u".", string)
 211
 212     elif commas:
 213         if string[-4:-3] == u"," and len(string) <= 7:
 214             # Single comma as a thousands separator.
 215             string = string.replace(u",", u"")
 216         else:
 217             # Single comma, not thousands - probably a decimal point.
 218             string = string.replace(u",", u".")
 219
 220     elif quotes:
 221         # Single quote, probably MM'SS", equivalent to a decimal point.
 222         string = string.replace(u"'", u".")
 223
 224     elif stops and string[-4:] == ".000":
 225         # Single stop, but no decimal - probably grouping.
 226         string = string.replace(u".", u"")
 227
 228     return string