def strip_punc(string):
    """Return *string* with punctuation and symbol characters removed.

    A character is dropped when its Unicode general category starts with
    "P" (punctuation) or "S" (symbol); every other character is kept in its
    original order.

    The kept characters are joined explicitly so that the result is a text
    string on both Python 2 and Python 3.  A bare ``filter()`` call only
    returns a string on Python 2; on Python 3 it yields a lazy iterator.
    """
    return u"".join(
        c for c in string if unicodedata.category(c)[0] not in "PS"
    )
6 def strip_ends(string
):
7 while string
and unicodedata
.category(string
[0])[0] in "ZPS":
9 while string
and unicodedata
.category(string
[-1])[0] in "ZPS":
13 def alnumsplit(string
):
16 string
= unicode(string
)
20 for i
, char
in enumerate(string
):
21 category
= unicodedata
.category(char
)
28 elif numeric
and char
.isalpha():
31 elif numeric
and category
in ["Zs", "Ps", "Pe"]:
34 elif not numeric
and char
.isnumeric():
38 strings
.append(strip_ends(string
[start
:i
]))
41 strings
.append(strip_ends(string
[start
:i
+ 1]))
45 """Check if a string is 'word-like'.
47 Word-like strings contain at least one alphanumeric character.
50 # Explicit loop is faster than:
51 #return any(map(type(string).isalnum, string))
59 def numeric(orig
, invalid
=float('inf')):
63 string
= unicode(orig
)
68 return (invalid
, orig
)
71 while string
[:1] == u
"-" or string
[:1] == u
"+":
72 if string
[:1] == u
"-":
76 if not string
[:1].isnumeric():
77 return (invalid
, orig
)
79 # Early out if possible.
81 return (float(string
) * mult
, orig
)
85 # Otherwise we need to do this the hard way.
86 string
= normalize_punc(string
)
91 v
= unicodedata
.numeric(c
)
98 whole
, frac
= string
.split(".")
99 whole
= _numeric(whole
)
100 frac
= _numeric(frac
) / (10.0 ** len(frac
))
101 return (mult
* (whole
+ frac
), orig
)
103 return (mult
* _numeric(string
), orig
)
105 def normalize_punc(string
):
106 string
= unicode(string
.strip(u
",.'"))
107 string
= filter(lambda u
: u
.isnumeric() or u
in u
",.'", string
)
108 commas
= string
.count(u
",")
109 stops
= string
.count(u
".")
110 quotes
= string
.count(u
"'")
112 # If anything occurs more than once, it's a separator.
114 string
= string
.replace(u
",", u
"")
117 string
= string
.replace(u
".", u
"")
120 string
= string
.replace(u
"'", u
"")
123 def normalize_two(a
, b
, string
):
124 # One of each - assume the first is grouping, second is point.
125 a_idx
= string
.rindex(a
)
126 b_idx
= string
.rindex(b
)
128 string
= string
.replace(b
, u
"").replace(a
, u
".")
130 string
= string
.replace(a
, u
"").replace(b
, u
".")
133 if commas
and stops
and quotes
:
134 # If all three, assume the middle is the decimal point.
139 # Not really valid, so do whatever we want...
142 comma_idx
= string
.index(u
",")
143 stops_idx
= string
.index(u
".")
144 quotes_idx
= string
.index(u
"'")
145 if (comma_idx
< stops_idx
< quotes_idx
146 or quotes_idx
< stops_idx
< comma_idx
):
147 string
= string
.replace(u
",", u
"").replace(u
"'", u
"")
148 elif (comma_idx
< quotes_idx
< stops_idx
149 or stops_idx
< quotes_idx
< comma_idx
):
150 string
= string
.replace(
155 string
= string
.replace(
160 elif stops
and quotes
:
161 string
= normalize_two(u
".", u
"'", string
)
163 elif commas
and quotes
:
164 string
= normalize_two(u
",", u
"'", string
)
166 elif commas
and stops
:
167 string
= normalize_two(u
",", u
".", string
)
170 if string
[-4:-3] == u
"," and len(string
) <= 7:
171 # Single comma as a thousands separator.
172 string
= string
.replace(u
",", u
"")
174 # Single comma, not thousands - probably a decimal point.
175 string
= string
.replace(u
",", u
".")
178 # Single quote, probably MM'SS", equivalent to a decimal point.
179 string
= string
.replace(u
"'", u
".")