collate/strings.py

   1 import unicodedata
   2
   3 CONTINUE_ON = frozenset([
   4     "Ll", "Lm", "Lo", "Lt", "Lu",
   5     "Mc", "Me", "Mn",
   6     "Nd", "Nl", "No",
   7     "Po",
   8     "Zs",
   9     ])
  10
  11 UNKNOWN, LETTER, NUMBER = range(3)
  12
  13 BREAKER = u"\u2029" # Paragraph break character
  14
  15 def sortemes(string):
  16     """Generate a list of sortemes for the string.
  17
  18     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
  19     sort information. This is larger than a word boundry but smaller
  20     than a sentence boundry; roughly, a sorteme boundry occurs between
  21     letters and numbers, between numbers and numbrs if 'too much'
  22     punctuation exists in between, between lines.
  23
  24     There is no formal specification for sortemes; the goal of this
  25     function is to provide good output for Collator.sortemekey.
  26     """
  27
  28     words = []
  29     letters = []
  30     digits = []
  31     if not string:
  32         return words
  33     string = unicode(string)
  34     categories = map(unicodedata.category, string)
  35     previous = UNKNOWN
  36     types = []
  37
  38     def stripends(word):
  39         while word and unicodedata.category(word[0])[0] in "PS":
  40             word = word[1:]
  41         while word and unicodedata.category(word[-1])[0] in "PS":
  42             word = word[:-1]
  43         return word
  44
  45     # TODO(jfw): This kind of evolved over time, there's probably a much
  46     # faster / more concise way to express it now.
  47     for i, (c, category) in enumerate(zip(string, categories)):
  48
  49         if letters and previous == LETTER and words:
  50             word = stripends(words.pop().strip())
  51             letters = list(stripends(word).strip() + BREAKER) + letters
  52             previous = UNKNOWN
  53
  54         # Split at the first letter following a number or
  55         # non-continuing character.
  56         if category[0] == "L":
  57             letters.append(c)
  58             if digits:
  59                 words.append(u"".join(digits).strip())
  60                 previous = NUMBER
  61                 digits = []
  62
  63         # Split at the first number following a non-number or
  64         # non-continuing character.
  65         elif category[0] == "N":
  66             digits.append(c)
  67             if letters:
  68                 words.append(u"".join(letters))
  69                 previous = LETTER
  70                 letters = []
  71
  72         # Only certain punctuation allowed in numbers.
  73         elif digits and c not in "',._":
  74             words.append(u"".join(digits))
  75             previous = NUMBER
  76             digits = []
  77
  78         # Split if we find a non-continuing character ("weird" ones).
  79         elif letters and category not in CONTINUE_ON:
  80             if letters:
  81                 words.append(u"".join(letters).strip() + BREAKER)
  82                 previous = LETTER
  83                 letters = []
  84             if digits:
  85                 words.append(u"".join(digits).strip() + BREAKER)
  86                 previous = NUMBER
  87                 digits = []
  88
  89         # Split if we find two pieces of punctuation in a row, even
  90         # if we should otherwise continue.
  91         elif i and categories[i-1][0] in "P" and category[0] in "P":
  92             if letters:
  93                 words.append(u"".join(letters))
  94                 previous = LETTER
  95                 letters = []
  96             if digits:
  97                 words.append(u"".join(digits))
  98                 previous = NUMBER
  99                 digits = []
 100
 101         else:
 102             if digits:
 103                 digits.append(c)
 104             elif letters:
 105                 letters.append(c)
 106
 107     if letters and previous == LETTER and words:
 108         word = stripends(words.pop().strip())
 109         letters = list(stripends(word).strip() + BREAKER) + letters
 110         previous = UNKNOWN
 111
 112     if letters:
 113         words.append(u"".join(letters))
 114         letters = []
 115     if digits:
 116         words.append(u"".join(digits))
 117         digits = []
 118
 119     words = map(stripends, words)
 120     return words
 121
 122 def numeric(orig, invalid=float('inf')):
 123     if not orig:
 124         return (invalid, '')
 125
 126     string = unicode(orig)
 127     for c in string:
 128         if c.isnumeric():
 129             break
 130     else:
 131         return (invalid, orig)
 132
 133     mult = 1
 134     while string[:1] == u"-" or string[:1] == u"+":
 135         if string[:1] == u"-":
 136             mult = -mult
 137         string = string[1:]
 138
 139     if not string[:1].isnumeric():
 140         return (invalid, orig)
 141
 142     string = normalize_punc(string)
 143
 144     # Early out if possible.
 145     try:
 146         return (float(string) * mult, orig)
 147     except ValueError:
 148         pass
 149
 150     # Otherwise we need to do this the hard way.
 151     def _numeric(string):
 152         total = 0
 153         for c in string:
 154             v = unicodedata.numeric(c)
 155             if v >= 1 or v == 0:
 156                 total *= 10
 157             total += v
 158         return total
 159
 160     try:
 161         whole, frac = string.split(".")
 162         whole = _numeric(whole)
 163         frac = _numeric(frac) / (10.0 ** len(frac))
 164         return (mult * (whole + frac), orig)
 165     except ValueError:
 166         return (mult * _numeric(string), orig)
 167
 168 def normalize_punc(string):
 169     string = unicode(string.strip(u",.'"))
 170     string = filter(lambda u: u.isnumeric() or u in u",.'", string)
 171     commas = string.count(u",")
 172     stops = string.count(u".")
 173     quotes = string.count(u"'")
 174
 175     # If anything occurs more than once, it's a separator.
 176     if commas > 1:
 177         string = string.replace(u",", u"")
 178         commas = 0
 179     if stops > 1:
 180         string = string.replace(u".", u"")
 181         stops = 0
 182     if quotes > 1:
 183         string = string.replace(u"'", u"")
 184         quotes = 0
 185
 186     def normalize_two(a, b, string):
 187         # One of each - assume the first is grouping, second is point.
 188         a_idx = string.rindex(a)
 189         b_idx = string.rindex(b)
 190         if a_idx > b_idx:
 191             string = string.replace(b, u"").replace(a, u".")
 192         else:
 193             string = string.replace(a, u"").replace(b, u".")
 194         return string
 195
 196     if commas and stops and quotes:
 197         # If all three, assume the middle is the decimal point.
 198         # A,AAA.BB'CC
 199         # A.AAA,BB'CC
 200         # A,AAA'BB.CC
 201         # A.AAA'BB,CC
 202         # Not really valid, so do whatever we want...
 203         # A'AAA.BB,CC
 204         # A'AAA,BB.CC
 205         comma_idx = string.index(u",")
 206         stops_idx = string.index(u".")
 207         quotes_idx = string.index(u"'")
 208         if (comma_idx < stops_idx < quotes_idx
 209             or quotes_idx < stops_idx < comma_idx):
 210             string = string.replace(u",", u"").replace(u"'", u"")
 211         elif (comma_idx < quotes_idx < stops_idx
 212             or stops_idx < quotes_idx < comma_idx):
 213             string = string.replace(
 214                 u",", u"").replace(
 215                 u".", u"").replace(
 216                 u"'", u".")
 217         else:
 218             string = string.replace(
 219                 u"'", u"").replace(
 220                 u".", u"").replace(
 221                 u",", u".")
 222
 223     elif stops and quotes:
 224         string = normalize_two(u".", u"'", string)
 225
 226     elif commas and quotes:
 227         string = normalize_two(u",", u"'", string)
 228
 229     elif commas and stops:
 230         string = normalize_two(u",", u".", string)
 231
 232     elif commas:
 233         if string[-4:-3] == u"," and len(string) <= 7:
 234             # Single comma as a thousands separator.
 235             string = string.replace(u",", u"")
 236         else:
 237             # Single comma, not thousands - probably a decimal point.
 238             string = string.replace(u",", u".")
 239
 240     elif quotes:
 241         # Single quote, probably MM'SS", equivalent to a decimal point.
 242         string = string.replace(u"'", u".")
 243
 244     elif stops and string[-4:] == ".000":
 245         # Single stop, but no decimal - probably grouping.
 246         string = string.replace(u".", u"")
 247
 248     return string