collate/strings.py

   1 """String utility functions for collation."""
   2
   3 __all__ = ["sortemes", "numeric", "normalize_number"]
   4
   5 import unicodedata
   6
# Unicode general categories (letters, marks, numbers, other
# punctuation, space separators) on which sortemes() keeps accumulating
# the current group of letters. A character in any other category ends
# the group.
CONTINUE_ON = frozenset([
    "Ll", "Lm", "Lo", "Lt", "Lu",
    "Mc", "Me", "Mn",
    "Nd", "Nl", "No",
    "Po",
    "Zs",
    ])

# Values for the `previous` state in sortemes(): the kind of group that
# was most recently flushed to the output list.
UNKNOWN, LETTER, NUMBER = range(3)

# Markers sortemes() embeds in letter groups. BREAKER is appended where
# a group was split and is used as the joint when a flushed group is
# re-attached to the letters that follow it; HBREAKER is appended when
# a letter is immediately followed by a number.
BREAKER = u"\u2028" # U+2028 LINE SEPARATOR
HBREAKER = u"\u2029" # U+2029 PARAGRAPH SEPARATOR
# Numeric slot used for letter groups in sortemes() output, and the
# default "no number found" result of numeric().
INFINITY = float('inf')

# Punctuation that normalize_number() retains alongside numeric
# characters when cleaning up a number.
KEEP_IN_NUMBERS = u"'.,"
# Non-numeric characters that do not end a run of digits in sortemes().
ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
  23
  24 def stripends(word):
  25     """Strip punctuation and symbols from the ends of a string."""
  26     while word and unicodedata.category(word[0])[0] in "PS":
  27         word = word[1:]
  28     while word and unicodedata.category(word[-1])[0] in "PS":
  29         word = word[:-1]
  30     return word
  31
  32 def sortemes(string, key=lambda s: s):
  33     """Generate a list of sortemes for the string.
  34
  35     A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
  36     sort information. This is larger than a word boundry but smaller
  37     than a sentence boundry; roughly, a sorteme boundry occurs between
  38     letters and numbers, between numbers and numbers if 'too much'
  39     punctuation exists in between, between lines.
  40
  41     There is no formal specification for sortemes; the goal of this
  42     function is to provide good output for Collator.sortemekey.
  43
  44     """
  45
  46     words = []
  47     letters = []
  48     digits = []
  49     if not string:
  50         return words
  51     string = unicode(string)
  52     categories = map(unicodedata.category, string)
  53     previous = UNKNOWN
  54
  55     def aletters(letters):
  56         """Add a group of letters to the word list."""
  57         words.append((INFINITY, stripends(letters)))
  58     def adigits(digits):
  59         """Add a group of digits to the word list."""
  60         words.append((numeric(digits), u''))
  61
  62     # TODO(jfw): This kind of evolved over time, there's probably a much
  63     # faster / more concise way to express it now.
  64     for i, (uchar, category) in enumerate(zip(string, categories)):
  65
  66         if letters and previous == LETTER and words:
  67             word = stripends(words.pop()[1].strip()) + BREAKER
  68             letters.insert(0, word)
  69             previous = UNKNOWN
  70
  71         # Split at the first letter following a number or
  72         # non-continuing character.
  73         if category[0] == "L":
  74             letters.append(uchar)
  75             if digits:
  76                 adigits(u"".join(digits).strip())
  77                 digits = []
  78                 previous = NUMBER
  79
  80         # Split at the first number following a non-number or
  81         # non-continuing character.
  82         elif category[0] == "N":
  83             digits.append(uchar)
  84             if letters:
  85                 if unicodedata.category(letters[-1])[0] == "L":
  86                     letters.append(HBREAKER)
  87                 aletters(u"".join(letters))
  88                 letters = []
  89                 previous = LETTER
  90
  91         # Only certain punctuation allowed in numbers.
  92         elif digits and uchar not in ALLOWED_IN_NUMBERS:
  93             adigits(u"".join(digits))
  94             digits = []
  95             previous = NUMBER
  96
  97         # Split if we find a non-continuing character ("weird" ones).
  98         elif letters and category not in CONTINUE_ON:
  99             if letters:
 100                 aletters(u"".join(letters).strip() + BREAKER)
 101                 letters = []
 102                 previous = LETTER
 103             if digits:
 104                 adigits(u"".join(digits).strip())
 105                 digits = []
 106                 previous = NUMBER
 107
 108         # Split if we find two pieces of punctuation in a row, even
 109         # if we should otherwise continue.
 110         elif i and categories[i-1][0] in "P" and category[0] in "P":
 111             if letters:
 112                 aletters(u"".join(letters))
 113                 letters = []
 114                 previous = LETTER
 115             if digits:
 116                 adigits(u"".join(digits))
 117                 digits = []
 118                 previous = NUMBER
 119
 120         else:
 121             if digits:
 122                 digits.append(uchar)
 123             elif letters:
 124                 letters.append(uchar)
 125
 126     if letters and previous == LETTER and words:
 127         word = stripends(words.pop()[1].strip()) + BREAKER
 128         letters.insert(0, word)
 129         previous = UNKNOWN
 130
 131     if letters:
 132         aletters(u"".join(letters))
 133     if digits:
 134         adigits(u"".join(digits))
 135
 136     return [(i, key(w) if w else u'') for i, w in words]
 137
 138 def numeric(orig, invalid=INFINITY):
 139     """Parse a number out of a string.
 140
 141     This function parses a unicode number out of the start of a
 142     string. If a number cannot be found at the start, the 'invalid'
 143     argument is returned.
 144
 145     """
 146
 147     if not orig:
 148         return invalid
 149
 150     string = unicode(orig)
 151     for uchar in string:
 152         if uchar.isnumeric():
 153             break
 154     else:
 155         return invalid
 156
 157     mult = 1
 158     while string[:1] == u"-" or string[:1] == u"+":
 159         if string[:1] == u"-":
 160             mult = -mult
 161         string = string[1:]
 162
 163     if not string[:1].isnumeric():
 164         return invalid
 165
 166     string = normalize_number(string)
 167
 168     def _numeric(string):
 169         """Interpreter a number as base 10."""
 170         total = 0
 171         for uchar in string:
 172             number = unicodedata.numeric(uchar)
 173             if number >= 1 or number == 0:
 174                 total *= 10
 175             total += number
 176         return total
 177
 178     try:
 179         whole, frac = string.split(".")
 180         whole = _numeric(whole)
 181         frac = _numeric(frac) / (10.0 ** len(frac))
 182         return mult * (whole + frac)
 183     except ValueError:
 184         return mult * _numeric(string)
 185
 186 def normalize_number(string):
 187     """Normalize punctuation in a number.
 188
 189     This function attempts to guess which characters in a number
 190     represent grouping separators and which represent decimal
 191     points. It returns a string that is valid to pass to Python's
 192     float() routine (potentially, NaN, if nothing like a number is
 193     found).
 194
 195     """
 196
 197     string = unicode(string)
 198     string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string)
 199     string = string.strip(KEEP_IN_NUMBERS)
 200
 201     commas = string.count(u",")
 202     stops = string.count(u".")
 203     quotes = string.count(u"'")
 204
 205     # If anything occurs more than once, it's a separator.
 206     if commas > 1:
 207         string = string.replace(u",", u"")
 208         commas = 0
 209     if stops > 1:
 210         string = string.replace(u".", u"")
 211         stops = 0
 212     if quotes > 1:
 213         string = string.replace(u"'", u"")
 214         quotes = 0
 215
 216     def normalize_two(a, b, string):
 217         """One of each - assume the first is grouping, second is point."""
 218         a_idx = string.rindex(a)
 219         b_idx = string.rindex(b)
 220         if a_idx > b_idx:
 221             string = string.replace(b, u"").replace(a, u".")
 222         else:
 223             string = string.replace(a, u"").replace(b, u".")
 224         return string
 225
 226     if commas and stops and quotes:
 227         # If all three, assume the middle is the decimal point.
 228         # A,AAA.BB'CC
 229         # A.AAA,BB'CC
 230         # A,AAA'BB.CC
 231         # A.AAA'BB,CC
 232         # Not really valid, so do whatever we want...
 233         # A'AAA.BB,CC
 234         # A'AAA,BB.CC
 235         comma_idx = string.index(u",")
 236         stops_idx = string.index(u".")
 237         quotes_idx = string.index(u"'")
 238         if (comma_idx < stops_idx < quotes_idx
 239             or quotes_idx < stops_idx < comma_idx):
 240             string = string.replace(u",", u"").replace(u"'", u"")
 241         elif (comma_idx < quotes_idx < stops_idx
 242             or stops_idx < quotes_idx < comma_idx):
 243             string = string.replace(
 244                 u",", u"").replace(
 245                 u".", u"").replace(
 246                 u"'", u".")
 247         else:
 248             string = string.replace(
 249                 u"'", u"").replace(
 250                 u".", u"").replace(
 251                 u",", u".")
 252
 253     elif stops and quotes:
 254         string = normalize_two(u".", u"'", string)
 255
 256     elif commas and quotes:
 257         string = normalize_two(u",", u"'", string)
 258
 259     elif commas and stops:
 260         string = normalize_two(u",", u".", string)
 261
 262     elif commas:
 263         if string[-4:-3] == u"," and len(string) <= 7:
 264             # Single comma as a thousands separator.
 265             string = string.replace(u",", u"")
 266         else:
 267             # Single comma, not thousands - probably a decimal point.
 268             string = string.replace(u",", u".")
 269
 270     elif quotes:
 271         # Single quote, probably MM'SS", equivalent to a decimal point.
 272         string = string.replace(u"'", u".")
 273
 274     elif stops and string[-4:] == ".000":
 275         # Single stop, but no decimal - probably grouping.
 276         string = string.replace(u".", u"")
 277
 278     return string or "NaN"