3 def strip_nonalnum(string
):
4 while string
and not (string
[0].isalpha() or string
[0].isnumeric()):
6 while string
and not (string
[-1].isalpha() or string
[-1].isnumeric()):
10 def alnumsplit(string
):
13 string
= unicode(string
)
17 for i
, char
in enumerate(string
):
24 elif numeric
and char
.isalpha():
27 elif numeric
and char
.isspace():
30 elif not numeric
and char
.isnumeric():
34 strings
.append(strip_nonalnum(string
[start
:i
]))
37 strings
.append(strip_nonalnum(string
[start
:i
+ 1]))
41 """Check if a string is 'word-like'.
43 Word-like strings contain at least one alphanumeric character.
46 # Explicit loop is faster than:
47 #return any(map(type(string).isalnum, string))
55 def numeric(orig
, invalid
=float('inf')):
59 string
= unicode(orig
)
64 return (invalid
, orig
)
67 while string
[:1] == u
"-" or string
[:1] == u
"+":
68 if string
[:1] == u
"-":
72 if not string
[:1].isnumeric():
73 return (invalid
, orig
)
75 # Early out if possible.
77 return (float(string
) * mult
, orig
)
81 # Otherwise we need to do this the hard way.
82 string
= normalize_punc(string
)
87 v
= unicodedata
.numeric(c
)
94 whole
, frac
= string
.split(".")
95 whole
= _numeric(whole
)
96 frac
= _numeric(frac
) / (10.0 ** len(frac
))
97 return (mult
* (whole
+ frac
), orig
)
99 return (mult
* _numeric(string
), orig
)
101 def normalize_punc(string
):
102 string
= unicode(string
.strip(u
",.'"))
103 string
= filter(lambda u
: u
.isnumeric() or u
in u
",.'", string
)
104 commas
= string
.count(u
",")
105 stops
= string
.count(u
".")
106 quotes
= string
.count(u
"'")
108 # If anything occurs more than once, it's a separator.
110 string
= string
.replace(u
",", u
"")
113 string
= string
.replace(u
".", u
"")
116 string
= string
.replace(u
"'", u
"")
119 def normalize_two(a
, b
, string
):
120 # One of each - assume the first is grouping, second is point.
121 a_idx
= string
.rindex(a
)
122 b_idx
= string
.rindex(b
)
124 string
= string
.replace(b
, u
"").replace(a
, u
".")
126 string
= string
.replace(a
, u
"").replace(b
, u
".")
129 if commas
and stops
and quotes
:
130 # If all three, assume the middle is the decimal point.
135 # Not really valid, so do whatever we want...
138 comma_idx
= string
.index(u
",")
139 stops_idx
= string
.index(u
".")
140 quotes_idx
= string
.index(u
"'")
141 if (comma_idx
< stops_idx
< quotes_idx
142 or quotes_idx
< stops_idx
< comma_idx
):
143 string
= string
.replace(u
",", u
"").replace(u
"'", u
"")
144 elif (comma_idx
< quotes_idx
< stops_idx
145 or stops_idx
< quotes_idx
< comma_idx
):
146 string
= string
.replace(
151 string
= string
.replace(
156 elif stops
and quotes
:
157 string
= normalize_two(u
".", u
"'", string
)
159 elif commas
and quotes
:
160 string
= normalize_two(u
",", u
"'", string
)
162 elif commas
and stops
:
163 string
= normalize_two(u
",", u
".", string
)
166 if string
[-4:-3] == u
"," and len(string
) <= 7:
167 # Single comma as a thousands separator.
168 string
= string
.replace(u
",", u
"")
170 # Single comma, not thousands - probably a decimal point.
171 string
= string
.replace(u
",", u
".")
174 # Single quote, probably MM'SS", equivalent to a decimal point.
175 string
= string
.replace(u
"'", u
".")