collate/_strings.py

   1 def alnumsplit(string):
   2     string = unicode(string)
   3     strings = []
   4     word = []
   5     numeric = None
   6     for char in string:
   7         if numeric is None:
   8             broke = False
   9             if char.isnumeric():
  10                 numeric = True
  11             elif char.isalpha():
  12                 numeric = False
  13         elif numeric and char.isalpha():
  14             broke = True
  15         elif not numeric and char.isnumeric():
  16             broke = True
  17         if broke:
  18             if word:
  19                 strings.append("".join(word))
  20                 word = []
  21             numeric = None
  22         word.append(char)
  23     if word:
  24         strings.append("".join(word))
  25     return strings
  26
  27 def wordlike(string):
  28     """Check if a string is 'word-like'.
  29
  30     Word-like strings contain at least one alphanumeric character.
  31     """
  32     return any(map(type(string).isalnum, string))
  33
  34 def numeric(string, invalid=float('inf')):
  35     string = unicode(string)
  36     if not any(map(type(string).isnumeric, string)):
  37         return (invalid, string)
  38     if not string:
  39         return (invalid, '')
  40
  41     mult = 1
  42     while string[:1] == "-" or string[:1] == "+":
  43         if string[0] == "-":
  44             mult = -mult
  45         string = string[1:]
  46
  47     # Maybe we got lucky and this is a trivial case...
  48     try:
  49         return float(string) * mult
  50     except ValueError:
  51         pass
  52
  53     # Otherwise we need to do this the hard way.
  54     return mult * float(normalize_dots(string))
  55
  56 def normalize_dots(string):
  57     string = unicode(string.strip(",.'"))
  58     string = filter(lambda u: u.isnumeric() or u in ",.'", string)
  59     commas = string.count(",")
  60     stops = string.count(".")
  61     quotes = string.count("'")
  62
  63     # If anything occurs more than once, it's a separator.
  64     if commas > 1:
  65         string = string.replace(",", "")
  66         commas = 0
  67     if stops > 1:
  68         string = string.replace(".", "")
  69         stops = 0
  70     if quotes > 1:
  71         string = string.replace("'", "")
  72         quotes = 0
  73
  74     def normalize_two(a, b):
  75         # One of each - assume the first is grouping, second is point.
  76         a_idx = string.rindex(a)
  77         b_idx = string.rindex(b)
  78         if a_idx > b_idx:
  79             string = string.replace(b, "").replace(a, ".")
  80         else:
  81             string = string.replace(a, "").replace(b, ".")
  82         return string
  83
  84     if commas and stops and quotes:
  85         # If all three, assume the middle is the decimal point.
  86         # A,AAA.BB'CC
  87         # A.AAA,BB'CC
  88         # A,AAA'BB.CC
  89         # A.AAA'BB,CC
  90         # Not really valid, so do whatever we want...
  91         # A'AAA.BB,CC
  92         # A'AAA,BB.CC
  93         comma_idx = string.index(",")
  94         stops_idx = string.index(".")
  95         quotes_idx = string.index("'")
  96         if (comma_idx < stops_idx < quotes_idx
  97             or quotes_idx < stops_idx < comma_idx):
  98             string = string.replace(",", "").replace("'", "")
  99         elif (comma_idx < quotes_idx < stops_idx
 100             or stops_idx < quotes_idx < comma_idx):
 101             string = string.replace(",", "").replace(".", "").replace("'", ".")
 102         else:
 103             string = string.replace("'", "").replace(".", "").replace(",", ".")
 104
 105     elif stops and quotes:
 106         string = normalize_two('.', "'")
 107
 108     elif commas and quotes:
 109         string = normalize_two(',', "'")
 110
 111     elif commas and stops:
 112         string = normalize_two(',', '.')
 113
 114     elif commas:
 115         if string[-4:-3] == "," and len(string) <= 7:
 116             # Single comma as a thousands separator.
 117             string = string.replace(",", "")
 118         else:
 119             # Single comma, not thousands - probably a decimal point.
 120             string = string.replace(",", ".")
 121
 122     elif quotes:
 123         # Single quote, probably MM'SS", equivalent to a decimal point.
 124         string = string.replace("'", ".")
 125
 126     return string