collate/strings.py

   1 """String utility functions for collation."""
   2
   3 __all__ = ["sortemes", "numeric", "normalize_number"]
   4
   5 import unicodedata
   6
   7 CONTINUE_ON = frozenset([
   8     "Ll", "Lm", "Lo", "Lt", "Lu",
   9     "Mc", "Me", "Mn",
  10     "Nd", "Nl", "No",
  11     "Po",
  12     "Zs",
  13     ])
  14
  15 UNKNOWN, LETTER, NUMBER = range(3)
  16
  17 BREAKER = u"\u2029" # Paragraph break character
  18 INFINITY = float('inf')
  19
  20 KEEP_IN_NUMBERS = u"'.,"
  21 ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
  22
  23 def stripends(word):
  24     """Strip punctuation and symbols from the ends of a string."""
  25     while word and unicodedata.category(word[0])[0] in "PS":
  26         word = word[1:]
  27     while word and unicodedata.category(word[-1])[0] in "PS":
  28         word = word[:-1]
  29     return word
  30
  31 def sortemes(string, key=lambda s: s):
  32     """Generate a list of sortemes for the string.
  33
  34     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
  35     sort information. This is larger than a word boundry but smaller
  36     than a sentence boundry; roughly, a sorteme boundry occurs between
  37     letters and numbers, between numbers and numbers if 'too much'
  38     punctuation exists in between, between lines.
  39
  40     There is no formal specification for sortemes; the goal of this
  41     function is to provide good output for Collator.sortemekey.
  42
  43     """
  44
  45     words = []
  46     letters = []
  47     digits = []
  48     if not string:
  49         return words
  50     string = unicode(string)
  51     categories = map(unicodedata.category, string)
  52     previous = UNKNOWN
  53
  54     def aletters(letters):
  55         """Add a group of letters to the word list."""
  56         words.append((INFINITY, stripends(letters)))
  57     def adigits(digits):
  58         """Add a group of digits to the word list."""
  59         words.append((numeric(digits), u''))
  60
  61     # TODO(jfw): This kind of evolved over time, there's probably a much
  62     # faster / more concise way to express it now.
  63     for i, (uchar, category) in enumerate(zip(string, categories)):
  64
  65         if letters and previous == LETTER and words:
  66             word = stripends(words.pop()[1].strip()) + BREAKER
  67             letters.insert(0, word)
  68             previous = UNKNOWN
  69
  70         # Split at the first letter following a number or
  71         # non-continuing character.
  72         if category[0] == "L":
  73             letters.append(uchar)
  74             if digits:
  75                 adigits(u"".join(digits).strip())
  76                 digits = []
  77                 previous = NUMBER
  78
  79         # Split at the first number following a non-number or
  80         # non-continuing character.
  81         elif category[0] == "N":
  82             digits.append(uchar)
  83             if letters:
  84                 aletters(u"".join(letters))
  85                 letters = []
  86                 previous = LETTER
  87
  88         # Only certain punctuation allowed in numbers.
  89         elif digits and uchar not in ALLOWED_IN_NUMBERS:
  90             adigits(u"".join(digits))
  91             digits = []
  92             previous = NUMBER
  93
  94         # Split if we find a non-continuing character ("weird" ones).
  95         elif letters and category not in CONTINUE_ON:
  96             if letters:
  97                 aletters(u"".join(letters).strip() + BREAKER)
  98                 letters = []
  99                 previous = LETTER
 100             if digits:
 101                 adigits(u"".join(digits).strip())
 102                 digits = []
 103                 previous = NUMBER
 104
 105         # Split if we find two pieces of punctuation in a row, even
 106         # if we should otherwise continue.
 107         elif i and categories[i-1][0] in "P" and category[0] in "P":
 108             if letters:
 109                 aletters(u"".join(letters))
 110                 letters = []
 111                 previous = LETTER
 112             if digits:
 113                 adigits(u"".join(digits))
 114                 digits = []
 115                 previous = NUMBER
 116
 117         else:
 118             if digits:
 119                 digits.append(uchar)
 120             elif letters:
 121                 letters.append(uchar)
 122
 123     if letters and previous == LETTER and words:
 124         word = stripends(words.pop()[1].strip()) + BREAKER
 125         letters.insert(0, word)
 126         previous = UNKNOWN
 127
 128     if letters:
 129         aletters(u"".join(letters))
 130     if digits:
 131         adigits(u"".join(digits))
 132
 133     return [(i, key(w) if w else u'') for i, w in words]
 134
 135 def numeric(orig, invalid=INFINITY):
 136     """Parse a number out of a string.
 137
 138     This function parses a unicode number out of the start of a
 139     string. If a number cannot be found at the start, the 'invalid'
 140     argument is returned.
 141
 142     """
 143
 144     if not orig:
 145         return invalid
 146
 147     string = unicode(orig)
 148     for uchar in string:
 149         if uchar.isnumeric():
 150             break
 151     else:
 152         return invalid
 153
 154     mult = 1
 155     while string[:1] == u"-" or string[:1] == u"+":
 156         if string[:1] == u"-":
 157             mult = -mult
 158         string = string[1:]
 159
 160     if not string[:1].isnumeric():
 161         return invalid
 162
 163     string = normalize_number(string)
 164
 165     def _numeric(string):
 166         """Interpreter a number as base 10."""
 167         total = 0
 168         for uchar in string:
 169             number = unicodedata.numeric(uchar)
 170             if number >= 1 or number == 0:
 171                 total *= 10
 172             total += number
 173         return total
 174
 175     try:
 176         whole, frac = string.split(".")
 177         whole = _numeric(whole)
 178         frac = _numeric(frac) / (10.0 ** len(frac))
 179         return mult * (whole + frac)
 180     except ValueError:
 181         return mult * _numeric(string)
 182
 183 def normalize_number(string):
 184     """Normalize punctuation in a number.
 185
 186     This function attempts to guess which characters in a number
 187     represent grouping separators and which represent decimal
 188     points. It returns a string that is valid to pass to Python's
 189     float() routine (potentially, NaN, if nothing like a number is
 190     found).
 191
 192     """
 193
 194     string = unicode(string)
 195     string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string)
 196     string = string.strip(KEEP_IN_NUMBERS)
 197
 198     commas = string.count(u",")
 199     stops = string.count(u".")
 200     quotes = string.count(u"'")
 201
 202     # If anything occurs more than once, it's a separator.
 203     if commas > 1:
 204         string = string.replace(u",", u"")
 205         commas = 0
 206     if stops > 1:
 207         string = string.replace(u".", u"")
 208         stops = 0
 209     if quotes > 1:
 210         string = string.replace(u"'", u"")
 211         quotes = 0
 212
 213     def normalize_two(a, b, string):
 214         """One of each - assume the first is grouping, second is point."""
 215         a_idx = string.rindex(a)
 216         b_idx = string.rindex(b)
 217         if a_idx > b_idx:
 218             string = string.replace(b, u"").replace(a, u".")
 219         else:
 220             string = string.replace(a, u"").replace(b, u".")
 221         return string
 222
 223     if commas and stops and quotes:
 224         # If all three, assume the middle is the decimal point.
 225         # A,AAA.BB'CC
 226         # A.AAA,BB'CC
 227         # A,AAA'BB.CC
 228         # A.AAA'BB,CC
 229         # Not really valid, so do whatever we want...
 230         # A'AAA.BB,CC
 231         # A'AAA,BB.CC
 232         comma_idx = string.index(u",")
 233         stops_idx = string.index(u".")
 234         quotes_idx = string.index(u"'")
 235         if (comma_idx < stops_idx < quotes_idx
 236             or quotes_idx < stops_idx < comma_idx):
 237             string = string.replace(u",", u"").replace(u"'", u"")
 238         elif (comma_idx < quotes_idx < stops_idx
 239             or stops_idx < quotes_idx < comma_idx):
 240             string = string.replace(
 241                 u",", u"").replace(
 242                 u".", u"").replace(
 243                 u"'", u".")
 244         else:
 245             string = string.replace(
 246                 u"'", u"").replace(
 247                 u".", u"").replace(
 248                 u",", u".")
 249
 250     elif stops and quotes:
 251         string = normalize_two(u".", u"'", string)
 252
 253     elif commas and quotes:
 254         string = normalize_two(u",", u"'", string)
 255
 256     elif commas and stops:
 257         string = normalize_two(u",", u".", string)
 258
 259     elif commas:
 260         if string[-4:-3] == u"," and len(string) <= 7:
 261             # Single comma as a thousands separator.
 262             string = string.replace(u",", u"")
 263         else:
 264             # Single comma, not thousands - probably a decimal point.
 265             string = string.replace(u",", u".")
 266
 267     elif quotes:
 268         # Single quote, probably MM'SS", equivalent to a decimal point.
 269         string = string.replace(u"'", u".")
 270
 271     elif stops and string[-4:] == ".000":
 272         # Single stop, but no decimal - probably grouping.
 273         string = string.replace(u".", u"")
 274
 275     return string or "NaN"