collate/_strings.py

   1 import unicodedata
   2
   3 def strip_punc(string):
   4     return filter(lambda c: unicodedata.category(c)[0] not in "PS", string)
   5
   6 def strip_ends(string):
   7     while string and unicodedata.category(string[0])[0] in "ZPS":
   8         string = string[1:]
   9     while string and unicodedata.category(string[-1])[0] in "ZPS":
  10         string = string[:-1]
  11     return string
  12
  13 def alnumsplit(string):
  14     if not string:
  15         return []
  16     string = unicode(string)
  17     strings = []
  18     numeric = None
  19     start = 0
  20     for i, char in enumerate(string):
  21         category = unicodedata.category(char)
  22         if numeric is None:
  23             broke = False
  24             if char.isnumeric():
  25                 numeric = True
  26             elif char.isalpha():
  27                 numeric = False
  28         elif numeric and char.isalpha():
  29             broke = True
  30             numeric = False
  31         elif numeric and category in ["Zs", "Ps", "Pe"]:
  32             broke = True
  33             numeric = None
  34         elif not numeric and char.isnumeric():
  35             broke = True
  36             numeric = True
  37         if broke:
  38             strings.append(strip_ends(string[start:i]))
  39             start = i
  40             broke = False
  41     strings.append(strip_ends(string[start:i + 1]))
  42     return strings
  43
  44 def wordlike(string):
  45     """Check if a string is 'word-like'.
  46
  47     Word-like strings contain at least one alphanumeric character.
  48     """
  49
  50     # Explicit loop is faster than:
  51     #return any(map(type(string).isalnum, string))
  52
  53     for c in string:
  54         if c.isalnum():
  55             return True
  56     else:
  57         return False
  58
  59 def numeric(orig, invalid=float('inf')):
  60     if not orig:
  61         return (invalid, '')
  62
  63     string = unicode(orig)
  64     for c in string:
  65         if c.isnumeric():
  66             break
  67     else:
  68         return (invalid, orig)
  69
  70     mult = 1
  71     while string[:1] == u"-" or string[:1] == u"+":
  72         if string[:1] == u"-":
  73             mult = -mult
  74         string = string[1:]
  75
  76     if not string[:1].isnumeric():
  77         return (invalid, orig)
  78
  79     # Early out if possible.
  80     try:
  81         return (float(string) * mult, orig)
  82     except ValueError:
  83         pass
  84
  85     # Otherwise we need to do this the hard way.
  86     string = normalize_punc(string)
  87
  88     def _numeric(string):
  89         total = 0
  90         for c in string:
  91             v = unicodedata.numeric(c)
  92             if v >= 1 or v == 0:
  93                 total *= 10
  94             total += v
  95         return total
  96
  97     try:
  98         whole, frac = string.split(".")
  99         whole = _numeric(whole)
 100         frac = _numeric(frac) / (10.0 ** len(frac))
 101         return (mult * (whole + frac), orig)
 102     except ValueError:
 103         return (mult * _numeric(string), orig)
 104
 105 def normalize_punc(string):
 106     string = unicode(string.strip(u",.'"))
 107     string = filter(lambda u: u.isnumeric() or u in u",.'", string)
 108     commas = string.count(u",")
 109     stops = string.count(u".")
 110     quotes = string.count(u"'")
 111
 112     # If anything occurs more than once, it's a separator.
 113     if commas > 1:
 114         string = string.replace(u",", u"")
 115         commas = 0
 116     if stops > 1:
 117         string = string.replace(u".", u"")
 118         stops = 0
 119     if quotes > 1:
 120         string = string.replace(u"'", u"")
 121         quotes = 0
 122
 123     def normalize_two(a, b, string):
 124         # One of each - assume the first is grouping, second is point.
 125         a_idx = string.rindex(a)
 126         b_idx = string.rindex(b)
 127         if a_idx > b_idx:
 128             string = string.replace(b, u"").replace(a, u".")
 129         else:
 130             string = string.replace(a, u"").replace(b, u".")
 131         return string
 132
 133     if commas and stops and quotes:
 134         # If all three, assume the middle is the decimal point.
 135         # A,AAA.BB'CC
 136         # A.AAA,BB'CC
 137         # A,AAA'BB.CC
 138         # A.AAA'BB,CC
 139         # Not really valid, so do whatever we want...
 140         # A'AAA.BB,CC
 141         # A'AAA,BB.CC
 142         comma_idx = string.index(u",")
 143         stops_idx = string.index(u".")
 144         quotes_idx = string.index(u"'")
 145         if (comma_idx < stops_idx < quotes_idx
 146             or quotes_idx < stops_idx < comma_idx):
 147             string = string.replace(u",", u"").replace(u"'", u"")
 148         elif (comma_idx < quotes_idx < stops_idx
 149             or stops_idx < quotes_idx < comma_idx):
 150             string = string.replace(
 151                 u",", u"").replace(
 152                 u".", u"").replace(
 153                 u"'", u".")
 154         else:
 155             string = string.replace(
 156                 u"'", u"").replace(
 157                 u".", u"").replace(
 158                 u",", u".")
 159
 160     elif stops and quotes:
 161         string = normalize_two(u".", u"'", string)
 162
 163     elif commas and quotes:
 164         string = normalize_two(u",", u"'", string)
 165
 166     elif commas and stops:
 167         string = normalize_two(u",", u".", string)
 168
 169     elif commas:
 170         if string[-4:-3] == u"," and len(string) <= 7:
 171             # Single comma as a thousands separator.
 172             string = string.replace(u",", u"")
 173         else:
 174             # Single comma, not thousands - probably a decimal point.
 175             string = string.replace(u",", u".")
 176
 177     elif quotes:
 178         # Single quote, probably MM'SS", equivalent to a decimal point.
 179         string = string.replace(u"'", u".")
 180
 181     return string