collate/_strings.py

   1 import unicodedata
   2
   3 def strip_nonalnum(string):
   4     while string and not (string[0].isalpha() or string[0].isnumeric()):
   5         string = string[1:]
   6     while string and not (string[-1].isalpha() or string[-1].isnumeric()):
   7         string = string[:-1]
   8     return string
   9
  10 def alnumsplit(string):
  11     if not string:
  12         return []
  13     string = unicode(string)
  14     strings = []
  15     numeric = None
  16     start = 0
  17     for i, char in enumerate(string):
  18         if numeric is None:
  19             broke = False
  20             if char.isnumeric():
  21                 numeric = True
  22             elif char.isalpha():
  23                 numeric = False
  24         elif numeric and char.isalpha():
  25             broke = True
  26             numeric = False
  27         elif numeric and char.isspace():
  28             broke = True
  29             numeric = None
  30         elif not numeric and char.isnumeric():
  31             broke = True
  32             numeric = True
  33         if broke:
  34             strings.append(strip_nonalnum(string[start:i]))
  35             start = i
  36             broke = False
  37     strings.append(strip_nonalnum(string[start:i + 1]))
  38     return strings
  39
  40 def wordlike(string):
  41     """Check if a string is 'word-like'.
  42
  43     Word-like strings contain at least one alphanumeric character.
  44     """
  45
  46     # Explicit loop is faster than:
  47     #return any(map(type(string).isalnum, string))
  48
  49     for c in string:
  50         if c.isalnum():
  51             return True
  52     else:
  53         return False
  54
  55 def numeric(orig, invalid=float('inf')):
  56     if not orig:
  57         return (invalid, '')
  58
  59     string = unicode(orig)
  60     for c in string:
  61         if c.isnumeric():
  62             break
  63     else:
  64         return (invalid, orig)
  65
  66     mult = 1
  67     while string[:1] == u"-" or string[:1] == u"+":
  68         if string[:1] == u"-":
  69             mult = -mult
  70         string = string[1:]
  71
  72     if not string[:1].isnumeric():
  73         return (invalid, orig)
  74
  75     # Early out if possible.
  76     try:
  77         return (float(string) * mult, orig)
  78     except ValueError:
  79         pass
  80
  81     # Otherwise we need to do this the hard way.
  82     string = normalize_punc(string)
  83
  84     def _numeric(string):
  85         total = 0
  86         for c in string:
  87             v = unicodedata.numeric(c)
  88             if v >= 1:
  89                 total *= 10
  90             total += v
  91         return total
  92
  93     try:
  94         whole, frac = string.split(".")
  95         whole = _numeric(whole)
  96         frac = _numeric(frac) / (10.0 ** len(frac))
  97         return (mult * (whole + frac), orig)
  98     except ValueError:
  99         return (mult * _numeric(string), orig)
 100
 101 def normalize_punc(string):
 102     string = unicode(string.strip(u",.'"))
 103     string = filter(lambda u: u.isnumeric() or u in u",.'", string)
 104     commas = string.count(u",")
 105     stops = string.count(u".")
 106     quotes = string.count(u"'")
 107
 108     # If anything occurs more than once, it's a separator.
 109     if commas > 1:
 110         string = string.replace(u",", u"")
 111         commas = 0
 112     if stops > 1:
 113         string = string.replace(u".", u"")
 114         stops = 0
 115     if quotes > 1:
 116         string = string.replace(u"'", u"")
 117         quotes = 0
 118
 119     def normalize_two(a, b, string):
 120         # One of each - assume the first is grouping, second is point.
 121         a_idx = string.rindex(a)
 122         b_idx = string.rindex(b)
 123         if a_idx > b_idx:
 124             string = string.replace(b, u"").replace(a, u".")
 125         else:
 126             string = string.replace(a, u"").replace(b, u".")
 127         return string
 128
 129     if commas and stops and quotes:
 130         # If all three, assume the middle is the decimal point.
 131         # A,AAA.BB'CC
 132         # A.AAA,BB'CC
 133         # A,AAA'BB.CC
 134         # A.AAA'BB,CC
 135         # Not really valid, so do whatever we want...
 136         # A'AAA.BB,CC
 137         # A'AAA,BB.CC
 138         comma_idx = string.index(u",")
 139         stops_idx = string.index(u".")
 140         quotes_idx = string.index(u"'")
 141         if (comma_idx < stops_idx < quotes_idx
 142             or quotes_idx < stops_idx < comma_idx):
 143             string = string.replace(u",", u"").replace(u"'", u"")
 144         elif (comma_idx < quotes_idx < stops_idx
 145             or stops_idx < quotes_idx < comma_idx):
 146             string = string.replace(
 147                 u",", u"").replace(
 148                 u".", u"").replace(
 149                 u"'", u".")
 150         else:
 151             string = string.replace(
 152                 u"'", u"").replace(
 153                 u".", u"").replace(
 154                 u",", u".")
 155
 156     elif stops and quotes:
 157         string = normalize_two(u".", u"'", string)
 158
 159     elif commas and quotes:
 160         string = normalize_two(u",", u"'", string)
 161
 162     elif commas and stops:
 163         string = normalize_two(u",", u".", string)
 164
 165     elif commas:
 166         if string[-4:-3] == u"," and len(string) <= 7:
 167             # Single comma as a thousands separator.
 168             string = string.replace(u",", u"")
 169         else:
 170             # Single comma, not thousands - probably a decimal point.
 171             string = string.replace(u",", u".")
 172
 173     elif quotes:
 174         # Single quote, probably MM'SS", equivalent to a decimal point.
 175         string = string.replace(u"'", u".")
 176
 177     return string