6bcfd9c5393c824e12edf84257c92866b9c48555
[python-collate.git] / collate / _strings.py
def alnumsplit(string):
    """Split a string into alternating alphabetic and numeric runs.

    The input is converted to text first.  A new piece starts whenever a
    numeric character follows an alphabetic run or an alphabetic character
    follows a numeric run.  Characters that are neither alphabetic nor
    numeric (spaces, punctuation, ...) never cause a split; they stay
    attached to whatever piece is currently being built.

    Returns a list of text pieces; joining them gives back the converted
    input.  An empty input yields an empty list.
    """
    try:
        to_text = unicode  # Python 2 text type
    except NameError:
        to_text = str  # no ``unicode`` builtin: ``str`` is the text type

    pieces = []
    word = []
    # Type of the run being built: True (numeric), False (alphabetic), or
    # None while only unclassified characters have been seen.
    run_numeric = None
    for char in to_text(string):
        # Classify numeric first so that characters which are both numeric
        # and alphabetic are consistently treated as numeric.
        if char.isnumeric():
            char_numeric = True
        elif char.isalpha():
            char_numeric = False
        else:
            char_numeric = None

        if char_numeric is not None:
            if run_numeric is None:
                run_numeric = char_numeric
            elif run_numeric != char_numeric:
                # The run type flipped: close the current piece.  The new
                # run takes the type of the character that starts it, so a
                # one-character run is still split off correctly.
                pieces.append("".join(word))
                word = []
                run_numeric = char_numeric
        word.append(char)

    if word:
        pieces.append("".join(word))
    return pieces
26
def wordlike(string):
    """Tell whether a string is 'word-like'.

    A string is word-like when at least one of its characters is
    alphanumeric.  The empty string is therefore not word-like.
    """
    # Look the predicate up on the string's own type, then scan until the
    # first alphanumeric character is found.
    is_alnum = type(string).isalnum
    for char in string:
        if is_alnum(char):
            return True
    return False
33
def numeric(string, invalid=float('inf')):
    """Interpret a string as a number, tolerating signs and separators.

    The input is converted to text.  Any run of leading ``+``/``-`` signs
    is consumed (each ``-`` flips the sign), then the remainder is parsed
    with ``float()``.  If that fails, the text is passed through
    ``normalize_dots`` to resolve grouping/decimal separators and parsed
    again.

    Returns a float on success.  When the text contains no numeric
    character at all (including the empty string), or still cannot be
    parsed after normalization, returns the tuple ``(invalid, text)``
    where ``text`` is the converted input.  Note that the two outcomes
    have different types.
    """
    try:
        to_text = unicode  # Python 2 text type
    except NameError:
        to_text = str  # no ``unicode`` builtin: ``str`` is the text type
    original = to_text(string)

    # Nothing numeric to work with; this also covers the empty string.
    if not any(char.isnumeric() for char in original):
        return (invalid, original)

    text = original
    mult = 1
    while text[:1] in ("-", "+"):
        if text[0] == "-":
            mult = -mult
        text = text[1:]

    # Maybe we got lucky and this is a trivial case...
    try:
        return float(text) * mult
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    try:
        return mult * float(normalize_dots(text))
    except ValueError:
        # Numeric characters float() does not understand (e.g. vulgar
        # fractions): report the string as invalid instead of raising.
        return (invalid, original)
55
def normalize_dots(string):
    """Rewrite a number's separators so that ``float()`` can parse it.

    Leading/trailing ``,``, ``.`` and ``'`` are stripped, then every
    character that is neither numeric nor one of those three separators
    is dropped.  The remaining separators are resolved heuristically:

    * a separator occurring more than once is a grouping mark and is
      removed;
    * with two different separators, the one appearing last is the
      decimal point and the other is removed;
    * with all three, the one in the middle is the decimal point;
    * a lone comma is a thousands separator when it sits four characters
      from the end of a string of at most seven characters, otherwise a
      decimal point;
    * a lone quote is treated as a decimal point;
    * a lone stop is left as it is.

    Returns the normalized text, containing at most one ``.``.
    """
    try:
        to_text = unicode  # Python 2 text type
    except NameError:
        to_text = str  # no ``unicode`` builtin: ``str`` is the text type

    text = to_text(string.strip(",.'"))
    # Build a string explicitly: filter() only returns a string on
    # Python 2, and the code below needs count()/replace().
    text = "".join(
        char for char in text if char.isnumeric() or char in ",.'")
    commas = text.count(",")
    stops = text.count(".")
    quotes = text.count("'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        text = text.replace(",", "")
        commas = 0
    if stops > 1:
        text = text.replace(".", "")
        stops = 0
    if quotes > 1:
        text = text.replace("'", "")
        quotes = 0

    def normalize_two(value, a, b):
        # One of each - assume the first is grouping, second is point.
        # The text is passed in explicitly: assigning to the enclosing
        # variable from here would make it local to this helper.
        if value.rindex(a) > value.rindex(b):
            return value.replace(b, "").replace(a, ".")
        return value.replace(a, "").replace(b, ".")

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        #   A,AAA.BB'CC
        #   A.AAA,BB'CC
        #   A,AAA'BB.CC
        #   A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        #   A'AAA.BB,CC
        #   A'AAA,BB.CC
        comma_idx = text.index(",")
        stops_idx = text.index(".")
        quotes_idx = text.index("'")
        if (comma_idx < stops_idx < quotes_idx
                or quotes_idx < stops_idx < comma_idx):
            text = text.replace(",", "").replace("'", "")
        elif (comma_idx < quotes_idx < stops_idx
                or stops_idx < quotes_idx < comma_idx):
            text = text.replace(",", "").replace(".", "").replace("'", ".")
        else:
            text = text.replace("'", "").replace(".", "").replace(",", ".")

    elif stops and quotes:
        text = normalize_two(text, '.', "'")

    elif commas and quotes:
        text = normalize_two(text, ',', "'")

    elif commas and stops:
        text = normalize_two(text, ',', '.')

    elif commas:
        if text[-4:-3] == "," and len(text) <= 7:
            # Single comma as a thousands separator.
            text = text.replace(",", "")
        else:
            # Single comma, not thousands - probably a decimal point.
            text = text.replace(",", ".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        text = text.replace("'", ".")

    return text