f81bfd7e20be5bc65529f85e3bccab5fb0f15d60
[python-collate.git] / collate / _strings.py
1 import unicodedata
2
def strip_nonalnum(string):
    """Strip leading and trailing non-alphanumeric characters.

    A character counts as alphanumeric when ``isalpha()`` or ``isnumeric()``
    is true for it.  Everything before the first and after the last such
    character is dropped; characters in between are left untouched.

    Returns the trimmed string.  The result is empty when ``string``
    contains no alphanumeric character; a falsy ``string`` is returned
    unchanged.
    """
    if not string:
        return string

    def is_alnum(char):
        return char.isalpha() or char.isnumeric()

    # Find both boundaries by index and slice once, rather than copying the
    # remainder of the string for every character that gets removed.
    start = 0
    end = len(string)
    while start < end and not is_alnum(string[start]):
        start += 1
    while end > start and not is_alnum(string[end - 1]):
        end -= 1
    return string[start:end]
9
def alnumsplit(string):
    """Split a string into alternating alphabetic and numeric pieces.

    The string is scanned left to right and cut wherever a numeric run is
    followed by an alphabetic character or by whitespace, or an alphabetic
    run is followed by a numeric character.  Whitespace inside an
    alphabetic run does not cause a split.  Every piece is passed through
    ``strip_nonalnum`` before it is stored, so a piece may be an empty
    string.

    Returns a list of text strings; a falsy ``string`` yields ``[]``.
    """
    if not string:
        return []
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # conversion works on both without naming the Python 2-only builtin.
    string = type(u"")(string)
    pieces = []
    # Tri-state: None until the current piece has shown an alphanumeric
    # character, then True inside a numeric run, False inside an
    # alphabetic run.
    in_number = None
    start = 0
    for i, char in enumerate(string):
        split_here = False
        if in_number is None:
            if char.isnumeric():
                in_number = True
            elif char.isalpha():
                in_number = False
        elif in_number:
            if char.isalpha():
                split_here = True
                in_number = False
            elif char.isspace():
                # Whitespace ends a number; the kind of the next piece is
                # decided by whatever follows.
                split_here = True
                in_number = None
        elif char.isnumeric():
            split_here = True
            in_number = True
        if split_here:
            pieces.append(strip_nonalnum(string[start:i]))
            start = i
    # Whatever is left after the last cut forms the final piece.
    pieces.append(strip_nonalnum(string[start:]))
    return pieces
39
def wordlike(string):
    """Check if a string is 'word-like'.

    Word-like strings contain at least one alphanumeric character.

    Returns True as soon as an alphanumeric character is found, and False
    when there is none (including for an empty string).
    """
    # An explicit loop is kept on purpose: it is faster than the equivalent
    # any(map(type(string).isalnum, string)).
    for char in string:
        if char.isalnum():
            return True
    return False
54
def numeric(orig, invalid=float('inf')):
    """Interpret a string as a number.

    Leading ``+`` and ``-`` signs are consumed first (each ``-`` flips the
    sign).  The remainder is handed to ``float()``; when that fails, the
    punctuation is normalized with ``normalize_punc`` and the value is
    built character by character from ``unicodedata.numeric``.

    orig -- the value to interpret
    invalid -- the number returned when ``orig`` cannot be interpreted

    Returns a ``(value, orig)`` tuple.  ``value`` is ``invalid`` when
    ``orig`` has no numeric character or when the first character after
    the signs is not numeric.  A falsy ``orig`` gives ``(invalid, '')``.
    """
    if not orig:
        return (invalid, '')

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # conversion works on both without naming the Python 2-only builtin.
    string = type(u"")(orig)
    if not any(c.isnumeric() for c in string):
        return (invalid, orig)

    mult = 1
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    if not string[:1].isnumeric():
        return (invalid, orig)

    # Early out if possible.
    try:
        return (float(string) * mult, orig)
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    string = normalize_punc(string)

    def _numeric(digits):
        total = 0
        for c in digits:
            v = unicodedata.numeric(c)
            # Zero is a positional digit as well, so it must shift the
            # total; otherwise "1000" would evaluate to 1.  Only values
            # strictly between 0 and 1 (vulgar fractions such as U+00BD)
            # are added without shifting.
            if v >= 1 or v == 0:
                total *= 10
            total += v
        return total

    whole, point, frac = string.partition(u".")
    if not point:
        # No decimal point: the whole string is the integer part.
        return (mult * _numeric(string), orig)
    value = _numeric(whole) + _numeric(frac) / (10.0 ** len(frac))
    return (mult * value, orig)
100
def normalize_punc(string):
    """Reduce the punctuation in a number to at most one decimal point.

    Commas, full stops and single quotes at either end are stripped, then
    every character that is neither numeric nor one of those three marks
    is dropped.  The remaining marks are classified as grouping separators
    (removed) or as the decimal point (rewritten to ``.``):

    * a mark occurring more than once is a grouping separator;
    * with all three marks present once, the middle one is the point;
    * with two different marks present once, the later one is the point;
    * a lone comma is a thousands separator when it sits four characters
      from the end of a string of at most seven characters, otherwise it
      is the point;
    * a lone quote is treated as the point;
    * a lone full stop is left as it is.

    Returns the normalized text string.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(string.strip(u",.'"))
    # A join instead of filter(): on Python 3 filter() returns an iterator,
    # which has none of the string methods used below.
    string = u"".join(c for c in string if c.isnumeric() or c in u",.'")
    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    def normalize_two(a, b, string):
        # One of each - assume the first is grouping, second is point.
        if string.rindex(a) > string.rindex(b):
            grouping, point = b, a
        else:
            grouping, point = a, b
        return string.replace(grouping, u"").replace(point, u".")

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        #   A,AAA.BB'CC
        #   A.AAA,BB'CC
        #   A,AAA'BB.CC
        #   A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        #   A'AAA.BB,CC
        #   A'AAA,BB.CC
        marks = sorted(u",.'", key=string.index)
        point = marks[1]
        # Drop the outer marks first so that a grouping "." is gone before
        # the real point is rewritten to ".".
        for mark in (marks[0], marks[2]):
            string = string.replace(mark, u"")
        string = string.replace(point, u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    return string