import unicodedata
def strip_punc(string):
    """Return *string* with all punctuation and symbol characters removed.

    A character is dropped when its Unicode general category starts with
    ``P`` (punctuation) or ``S`` (symbol); every other character, including
    whitespace, is kept in its original order.

    *string* must be a text (unicode) string, because
    ``unicodedata.category`` only accepts unicode characters.
    """
    # Join explicitly instead of calling filter(): filter() only returns a
    # string on Python 2, while on Python 3 it yields a lazy iterator.
    return u"".join(
        char for char in string
        if unicodedata.category(char)[0] not in "PS"
    )
+
def strip_ends(string):
    """Return *string* without leading/trailing separators, punctuation or symbols.

    Characters are trimmed from both ends while their Unicode general
    category starts with ``Z`` (separator), ``P`` (punctuation) or ``S``
    (symbol). Characters in the middle are never touched. Control characters
    such as tabs (category ``Cc``) are not trimmed.

    A falsy argument (empty string, ``None``) is returned unchanged.
    """
    if not string:
        return string

    # Move two indices inwards and slice once, instead of re-slicing the
    # whole string for every stripped character (which is quadratic).
    start, end = 0, len(string)
    while start < end and unicodedata.category(string[start])[0] in "ZPS":
        start += 1
    while end > start and unicodedata.category(string[end - 1])[0] in "ZPS":
        end -= 1
    return string[start:end]
+
# alnumsplit: split a string into a list of pieces, starting a new piece
# where the text switches between numeric and alphabetic characters.
#
# NOTE(review): this region is a patch rendering, not plain Python. Lines
# prefixed "-" belong to the older implementation (characters collected in
# a `word` list and joined); lines prefixed "+" belong to the newer one
# (index-based slicing, each piece passed through strip_ends). Indentation
# has been lost, so the nesting has to be inferred from the keywords.
def alnumsplit(string):
# New guard: falsy input yields an empty list straight away.
+ if not string:
+ return []
# `unicode` is the Python 2 text type; isnumeric() below needs text.
string = unicode(string)
strings = []
- word = []
# Tri-state flag for the current run: None = undecided, True = numeric,
# False = non-numeric.
numeric = None
- for char in string:
# `start` is the index where the piece currently being scanned begins.
+ start = 0
+ for i, char in enumerate(string):
+ category = unicodedata.category(char)
if numeric is None:
broke = False
if char.isnumeric():
# NOTE(review): lines appear to be elided between the `if` above and the
# assignment below (two patch hunks seem to abut here); as shown, a numeric
# character would set numeric = False. Confirm against the full file.
numeric = False
# Numeric run followed by a letter: break, the new run is non-numeric.
elif numeric and char.isalpha():
broke = True
+ numeric = False
# Numeric run followed by a space separator (Zs) or opening/closing
# punctuation (Ps/Pe): break and go back to the undecided state.
+ elif numeric and category in ["Zs", "Ps", "Pe"]:
+ broke = True
+ numeric = None
# Non-numeric run followed by a numeric character: break, new run is numeric.
elif not numeric and char.isnumeric():
broke = True
+ numeric = True
if broke:
- if word:
- strings.append(u"".join(word))
- word = []
- numeric = None
- word.append(char)
- if word:
- strings.append(u"".join(word))
# Emit everything before the current character as one piece; the current
# character becomes the first character of the next piece.
+ strings.append(strip_ends(string[start:i]))
+ start = i
# The new code sets `numeric` itself on each break, so `broke` has to be
# cleared here rather than via the `numeric is None` branch.
+ broke = False
# Final piece: from `start` through the last character; `i` still holds the
# last index from the loop above.
+ strings.append(strip_ends(string[start:i + 1]))
return strings
def wordlike(string):
# numeric: turn a string into a (number, original) pair; `invalid` (positive
# infinity by default) stands in for the number when no value can be read.
#
# NOTE(review): this region is a patch rendering with hunks elided and
# indentation lost. `mult` is never assigned in the visible lines, the
# `try:` has no visible `except`, and the final `return total` is a bare
# number rather than a tuple -- confirm all of this against the full file.
def numeric(orig, invalid=float('inf')):
# Falsy input: report the invalid marker together with an empty string.
if not orig:
return (invalid, '')
+
string = unicode(orig)
for c in string:
if c.isnumeric():
# NOTE(review): the two lines below look like the tail of a sign-handling
# step (flip the multiplier, drop the first character); the surrounding
# lines are not visible here.
mult = -mult
string = string[1:]
# New guard: give up unless the remaining text starts with a numeric character.
+ if not string[:1].isnumeric():
+ return (invalid, orig)
+
# Early out if possible.
try:
return (float(string) * mult, orig)
# Fallback: build the value character by character from each character's
# Unicode numeric value.
total = 0
for c in string:
v = unicodedata.numeric(c)
# The patch widens the test so that a zero digit also shifts the running
# total one decimal place; with the old `v >= 1` test a zero did not.
- if v >= 1:
+ if v >= 1 or v == 0:
total *= 10
# presumably executed for every character, not only inside the `if` above
# (indentation is lost) -- verify.
total += v
# NOTE(review): returns a bare number, unlike the tuple returns above; this
# tail may belong to a different definition whose header is elided.
return total