collate/strings.py

   1 import unicodedata
   2
   3 CONTINUE_ON = frozenset([
   4     "Ll", "Lm", "Lo", "Lt", "Lu",
   5     "Mc", "Me", "Mn",
   6     "Nd", "Nl", "No",
   7     "Po",
   8     "Zs",
   9     ])
  10
  11 UNKNOWN, LETTER, NUMBER = range(3)
  12
  13 BREAKER = u"\u2029" # Paragraph break character
  14 INFINITY = float('inf')
  15
  16 def sortemes(string, key=lambda s: s):
  17     """Generate a list of sortemes for the string.
  18
  19     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
  20     sort information. This is larger than a word boundry but smaller
  21     than a sentence boundry; roughly, a sorteme boundry occurs between
  22     letters and numbers, between numbers and numbrs if 'too much'
  23     punctuation exists in between, between lines.
  24
  25     There is no formal specification for sortemes; the goal of this
  26     function is to provide good output for Collator.sortemekey.
  27     """
  28
  29     words = []
  30     letters = []
  31     digits = []
  32     if not string:
  33         return words
  34     string = unicode(string)
  35     categories = map(unicodedata.category, string)
  36     previous = UNKNOWN
  37     types = []
  38
  39     def stripends(word):
  40         while word and unicodedata.category(word[0])[0] in "PS":
  41             word = word[1:]
  42         while word and unicodedata.category(word[-1])[0] in "PS":
  43             word = word[:-1]
  44         return word
  45
  46     def aletters(letters):
  47         words.append((INFINITY, stripends(letters)))
  48     def adigits(digits):
  49         words.append((numeric(digits), u''))
  50
  51     # TODO(jfw): This kind of evolved over time, there's probably a much
  52     # faster / more concise way to express it now.
  53     for i, (c, category) in enumerate(zip(string, categories)):
  54
  55         if letters and previous == LETTER and words:
  56             word = stripends(words.pop()[1].strip()) + BREAKER
  57             letters.insert(0, word)
  58             previous = UNKNOWN
  59
  60         # Split at the first letter following a number or
  61         # non-continuing character.
  62         if category[0] == "L":
  63             letters.append(c)
  64             if digits:
  65                 adigits(u"".join(digits).strip())
  66                 digits = []
  67                 previous = NUMBER
  68
  69         # Split at the first number following a non-number or
  70         # non-continuing character.
  71         elif category[0] == "N":
  72             digits.append(c)
  73             if letters:
  74                 aletters(u"".join(letters))
  75                 letters = []
  76                 previous = LETTER
  77
  78         # Only certain punctuation allowed in numbers.
  79         elif digits and c not in "',._":
  80             adigits(u"".join(digits))
  81             digits = []
  82             previous = NUMBER
  83
  84         # Split if we find a non-continuing character ("weird" ones).
  85         elif letters and category not in CONTINUE_ON:
  86             if letters:
  87                 aletters(u"".join(letters).strip() + BREAKER)
  88                 letters = []
  89                 previous = LETTER
  90             if digits:
  91                 adigits(u"".join(digits).strip())
  92                 digits = []
  93                 previous = NUMBER
  94
  95         # Split if we find two pieces of punctuation in a row, even
  96         # if we should otherwise continue.
  97         elif i and categories[i-1][0] in "P" and category[0] in "P":
  98             if letters:
  99                 aletters(u"".join(letters))
 100                 letters = []
 101                 previous = LETTER
 102             if digits:
 103                 adigits(u"".join(digits))
 104                 digits = []
 105                 previous = NUMBER
 106
 107         else:
 108             if digits:
 109                 digits.append(c)
 110             elif letters:
 111                 letters.append(c)
 112
 113     if letters and previous == LETTER and words:
 114         word = stripends(words.pop()[1].strip()) + BREAKER
 115         letters.insert(0, word)
 116         previous = UNKNOWN
 117
 118     if letters:
 119         aletters(u"".join(letters))
 120     if digits:
 121         adigits(u"".join(digits))
 122
 123     return [(i, key(w) if w else u'') for i, w in words]
 124
 125 def numeric(orig, invalid=float('inf')):
 126     if not orig:
 127         return invalid
 128
 129     string = unicode(orig)
 130     for c in string:
 131         if c.isnumeric():
 132             break
 133     else:
 134         return invalid
 135
 136     mult = 1
 137     while string[:1] == u"-" or string[:1] == u"+":
 138         if string[:1] == u"-":
 139             mult = -mult
 140         string = string[1:]
 141
 142     if not string[:1].isnumeric():
 143         return (invalid, orig)
 144
 145     string = normalize_punc(string)
 146
 147     # Early out if possible.
 148     try:
 149         return float(string) * mult
 150     except ValueError:
 151         pass
 152
 153     # Otherwise we need to do this the hard way.
 154     def _numeric(string):
 155         total = 0
 156         for c in string:
 157             v = unicodedata.numeric(c)
 158             if v >= 1 or v == 0:
 159                 total *= 10
 160             total += v
 161         return total
 162
 163     try:
 164         whole, frac = string.split(".")
 165         whole = _numeric(whole)
 166         frac = _numeric(frac) / (10.0 ** len(frac))
 167         return mult * (whole + frac)
 168     except ValueError:
 169         return mult * _numeric(string)
 170
 171 def normalize_punc(string):
 172     string = unicode(string.strip(u",.'"))
 173     string = filter(lambda u: u.isnumeric() or u in u",.'", string)
 174     commas = string.count(u",")
 175     stops = string.count(u".")
 176     quotes = string.count(u"'")
 177
 178     # If anything occurs more than once, it's a separator.
 179     if commas > 1:
 180         string = string.replace(u",", u"")
 181         commas = 0
 182     if stops > 1:
 183         string = string.replace(u".", u"")
 184         stops = 0
 185     if quotes > 1:
 186         string = string.replace(u"'", u"")
 187         quotes = 0
 188
 189     def normalize_two(a, b, string):
 190         # One of each - assume the first is grouping, second is point.
 191         a_idx = string.rindex(a)
 192         b_idx = string.rindex(b)
 193         if a_idx > b_idx:
 194             string = string.replace(b, u"").replace(a, u".")
 195         else:
 196             string = string.replace(a, u"").replace(b, u".")
 197         return string
 198
 199     if commas and stops and quotes:
 200         # If all three, assume the middle is the decimal point.
 201         # A,AAA.BB'CC
 202         # A.AAA,BB'CC
 203         # A,AAA'BB.CC
 204         # A.AAA'BB,CC
 205         # Not really valid, so do whatever we want...
 206         # A'AAA.BB,CC
 207         # A'AAA,BB.CC
 208         comma_idx = string.index(u",")
 209         stops_idx = string.index(u".")
 210         quotes_idx = string.index(u"'")
 211         if (comma_idx < stops_idx < quotes_idx
 212             or quotes_idx < stops_idx < comma_idx):
 213             string = string.replace(u",", u"").replace(u"'", u"")
 214         elif (comma_idx < quotes_idx < stops_idx
 215             or stops_idx < quotes_idx < comma_idx):
 216             string = string.replace(
 217                 u",", u"").replace(
 218                 u".", u"").replace(
 219                 u"'", u".")
 220         else:
 221             string = string.replace(
 222                 u"'", u"").replace(
 223                 u".", u"").replace(
 224                 u",", u".")
 225
 226     elif stops and quotes:
 227         string = normalize_two(u".", u"'", string)
 228
 229     elif commas and quotes:
 230         string = normalize_two(u",", u"'", string)
 231
 232     elif commas and stops:
 233         string = normalize_two(u",", u".", string)
 234
 235     elif commas:
 236         if string[-4:-3] == u"," and len(string) <= 7:
 237             # Single comma as a thousands separator.
 238             string = string.replace(u",", u"")
 239         else:
 240             # Single comma, not thousands - probably a decimal point.
 241             string = string.replace(u",", u".")
 242
 243     elif quotes:
 244         # Single quote, probably MM'SS", equivalent to a decimal point.
 245         string = string.replace(u"'", u".")
 246
 247     elif stops and string[-4:] == ".000":
 248         # Single stop, but no decimal - probably grouping.
 249         string = string.replace(u".", u"")
 250
 251     return string