3 def alnumsplit(string
):
4 string
= unicode(string
)
15 elif numeric
and char
.isalpha():
17 elif not numeric
and char
.isnumeric():
21 strings
.append(u
"".join(word
))
26 strings
.append(u
"".join(word
))
30 """Check if a string is 'word-like'.
32 Word-like strings contain at least one alphanumeric character.
35 # Explicit loop is faster than:
36 #return any(map(type(string).isalnum, string))
44 def numeric(orig
, invalid
=float('inf')):
47 string
= unicode(orig
)
52 return (invalid
, orig
)
55 while string
[:1] == u
"-" or string
[:1] == u
"+":
56 if string
[:1] == u
"-":
60 # Early out if possible.
62 return (float(string
) * mult
, orig
)
66 # Otherwise we need to do this the hard way.
67 string
= normalize_punc(string
)
72 v
= unicodedata
.numeric(c
)
79 whole
, frac
= string
.split(".")
80 whole
= _numeric(whole
)
81 frac
= _numeric(frac
) / (10.0 ** len(frac
))
82 return (mult
* (whole
+ frac
), orig
)
84 return (mult
* _numeric(string
), orig
)
86 def normalize_punc(string
):
87 string
= unicode(string
.strip(u
",.'"))
88 string
= filter(lambda u
: u
.isnumeric() or u
in u
",.'", string
)
89 commas
= string
.count(u
",")
90 stops
= string
.count(u
".")
91 quotes
= string
.count(u
"'")
93 # If anything occurs more than once, it's a separator.
95 string
= string
.replace(u
",", u
"")
98 string
= string
.replace(u
".", u
"")
101 string
= string
.replace(u
"'", u
"")
104 def normalize_two(a
, b
, string
):
105 # One of each - assume the first is grouping, second is point.
106 a_idx
= string
.rindex(a
)
107 b_idx
= string
.rindex(b
)
109 string
= string
.replace(b
, u
"").replace(a
, u
".")
111 string
= string
.replace(a
, u
"").replace(b
, u
".")
114 if commas
and stops
and quotes
:
115 # If all three, assume the middle is the decimal point.
120 # Not really valid, so do whatever we want...
123 comma_idx
= string
.index(u
",")
124 stops_idx
= string
.index(u
".")
125 quotes_idx
= string
.index(u
"'")
126 if (comma_idx
< stops_idx
< quotes_idx
127 or quotes_idx
< stops_idx
< comma_idx
):
128 string
= string
.replace(u
",", u
"").replace(u
"'", u
"")
129 elif (comma_idx
< quotes_idx
< stops_idx
130 or stops_idx
< quotes_idx
< comma_idx
):
131 string
= string
.replace(
136 string
= string
.replace(
141 elif stops
and quotes
:
142 string
= normalize_two(u
".", u
"'", string
)
144 elif commas
and quotes
:
145 string
= normalize_two(u
",", u
"'", string
)
147 elif commas
and stops
:
148 string
= normalize_two(u
",", u
".", string
)
151 if string
[-4:-3] == u
"," and len(string
) <= 7:
152 # Single comma as a thousands separator.
153 string
= string
.replace(u
",", u
"")
155 # Single comma, not thousands - probably a decimal point.
156 string
= string
.replace(u
",", u
".")
159 # Single quote, probably MM'SS", equivalent to a decimal point.
160 string
= string
.replace(u
"'", u
".")