collate/strings.py

   1 import unicodedata
   2
   3 CONTINUE_ON = frozenset([
   4     "Ll", "Lm", "Lo", "Lt", "Lu",
   5     "Mc", "Me", "Mn",
   6     "Nd", "Nl", "No",
   7     "Po",
   8     "Zs",
   9     ])
  10
  11 UNKNOWN, LETTER, NUMBER = range(3)
  12
  13 BREAKER = u"\u2029" # Paragraph break character
  14 INFINITY = float('inf')
  15
  16 def sortemes(string, key=lambda s: s):
  17     """Generate a list of sortemes for the string.
  18
  19     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
  20     sort information. This is larger than a word boundry but smaller
  21     than a sentence boundry; roughly, a sorteme boundry occurs between
  22     letters and numbers, between numbers and numbers if 'too much'
  23     punctuation exists in between, between lines.
  24
  25     There is no formal specification for sortemes; the goal of this
  26     function is to provide good output for Collator.sortemekey.
  27     """
  28
  29     words = []
  30     letters = []
  31     digits = []
  32     if not string:
  33         return words
  34     string = unicode(string)
  35     categories = map(unicodedata.category, string)
  36     previous = UNKNOWN
  37
  38     def stripends(word):
  39         while word and unicodedata.category(word[0])[0] in "PS":
  40             word = word[1:]
  41         while word and unicodedata.category(word[-1])[0] in "PS":
  42             word = word[:-1]
  43         return word
  44
  45     def aletters(letters):
  46         words.append((INFINITY, stripends(letters)))
  47     def adigits(digits):
  48         words.append((numeric(digits), u''))
  49
  50     # TODO(jfw): This kind of evolved over time, there's probably a much
  51     # faster / more concise way to express it now.
  52     for i, (c, category) in enumerate(zip(string, categories)):
  53
  54         if letters and previous == LETTER and words:
  55             word = stripends(words.pop()[1].strip()) + BREAKER
  56             letters.insert(0, word)
  57             previous = UNKNOWN
  58
  59         # Split at the first letter following a number or
  60         # non-continuing character.
  61         if category[0] == "L":
  62             letters.append(c)
  63             if digits:
  64                 adigits(u"".join(digits).strip())
  65                 digits = []
  66                 previous = NUMBER
  67
  68         # Split at the first number following a non-number or
  69         # non-continuing character.
  70         elif category[0] == "N":
  71             digits.append(c)
  72             if letters:
  73                 aletters(u"".join(letters))
  74                 letters = []
  75                 previous = LETTER
  76
  77         # Only certain punctuation allowed in numbers.
  78         elif digits and c not in "',._":
  79             adigits(u"".join(digits))
  80             digits = []
  81             previous = NUMBER
  82
  83         # Split if we find a non-continuing character ("weird" ones).
  84         elif letters and category not in CONTINUE_ON:
  85             if letters:
  86                 aletters(u"".join(letters).strip() + BREAKER)
  87                 letters = []
  88                 previous = LETTER
  89             if digits:
  90                 adigits(u"".join(digits).strip())
  91                 digits = []
  92                 previous = NUMBER
  93
  94         # Split if we find two pieces of punctuation in a row, even
  95         # if we should otherwise continue.
  96         elif i and categories[i-1][0] in "P" and category[0] in "P":
  97             if letters:
  98                 aletters(u"".join(letters))
  99                 letters = []
 100                 previous = LETTER
 101             if digits:
 102                 adigits(u"".join(digits))
 103                 digits = []
 104                 previous = NUMBER
 105
 106         else:
 107             if digits:
 108                 digits.append(c)
 109             elif letters:
 110                 letters.append(c)
 111
 112     if letters and previous == LETTER and words:
 113         word = stripends(words.pop()[1].strip()) + BREAKER
 114         letters.insert(0, word)
 115         previous = UNKNOWN
 116
 117     if letters:
 118         aletters(u"".join(letters))
 119     if digits:
 120         adigits(u"".join(digits))
 121
 122     return [(i, key(w) if w else u'') for i, w in words]
 123
 124 def numeric(orig, invalid=INFINITY):
 125     if not orig:
 126         return invalid
 127
 128     string = unicode(orig)
 129     for c in string:
 130         if c.isnumeric():
 131             break
 132     else:
 133         return invalid
 134
 135     mult = 1
 136     while string[:1] == u"-" or string[:1] == u"+":
 137         if string[:1] == u"-":
 138             mult = -mult
 139         string = string[1:]
 140
 141     if not string[:1].isnumeric():
 142         return (invalid, orig)
 143
 144     string = normalize_punc(string)
 145
 146     # Otherwise we need to do this the hard way.
 147     def _numeric(string):
 148         total = 0
 149         for c in string:
 150             v = unicodedata.numeric(c)
 151             if v >= 1 or v == 0:
 152                 total *= 10
 153             total += v
 154         return total
 155
 156     try:
 157         whole, frac = string.split(".")
 158         whole = _numeric(whole)
 159         frac = _numeric(frac) / (10.0 ** len(frac))
 160         return mult * (whole + frac)
 161     except ValueError:
 162         return mult * _numeric(string)
 163
 164 def normalize_punc(string):
 165     string = unicode(string.strip(u",.'"))
 166     string = filter(lambda u: u.isnumeric() or u in u",.'", string)
 167     commas = string.count(u",")
 168     stops = string.count(u".")
 169     quotes = string.count(u"'")
 170
 171     # If anything occurs more than once, it's a separator.
 172     if commas > 1:
 173         string = string.replace(u",", u"")
 174         commas = 0
 175     if stops > 1:
 176         string = string.replace(u".", u"")
 177         stops = 0
 178     if quotes > 1:
 179         string = string.replace(u"'", u"")
 180         quotes = 0
 181
 182     def normalize_two(a, b, string):
 183         # One of each - assume the first is grouping, second is point.
 184         a_idx = string.rindex(a)
 185         b_idx = string.rindex(b)
 186         if a_idx > b_idx:
 187             string = string.replace(b, u"").replace(a, u".")
 188         else:
 189             string = string.replace(a, u"").replace(b, u".")
 190         return string
 191
 192     if commas and stops and quotes:
 193         # If all three, assume the middle is the decimal point.
 194         # A,AAA.BB'CC
 195         # A.AAA,BB'CC
 196         # A,AAA'BB.CC
 197         # A.AAA'BB,CC
 198         # Not really valid, so do whatever we want...
 199         # A'AAA.BB,CC
 200         # A'AAA,BB.CC
 201         comma_idx = string.index(u",")
 202         stops_idx = string.index(u".")
 203         quotes_idx = string.index(u"'")
 204         if (comma_idx < stops_idx < quotes_idx
 205             or quotes_idx < stops_idx < comma_idx):
 206             string = string.replace(u",", u"").replace(u"'", u"")
 207         elif (comma_idx < quotes_idx < stops_idx
 208             or stops_idx < quotes_idx < comma_idx):
 209             string = string.replace(
 210                 u",", u"").replace(
 211                 u".", u"").replace(
 212                 u"'", u".")
 213         else:
 214             string = string.replace(
 215                 u"'", u"").replace(
 216                 u".", u"").replace(
 217                 u",", u".")
 218
 219     elif stops and quotes:
 220         string = normalize_two(u".", u"'", string)
 221
 222     elif commas and quotes:
 223         string = normalize_two(u",", u"'", string)
 224
 225     elif commas and stops:
 226         string = normalize_two(u",", u".", string)
 227
 228     elif commas:
 229         if string[-4:-3] == u"," and len(string) <= 7:
 230             # Single comma as a thousands separator.
 231             string = string.replace(u",", u"")
 232         else:
 233             # Single comma, not thousands - probably a decimal point.
 234             string = string.replace(u",", u".")
 235
 236     elif quotes:
 237         # Single quote, probably MM'SS", equivalent to a decimal point.
 238         string = string.replace(u"'", u".")
 239
 240     elif stops and string[-4:] == ".000":
 241         # Single stop, but no decimal - probably grouping.
 242         string = string.replace(u".", u"")
 243
 244     return string