words = []
for sorteme in self.sortemes(string):
num, alpha = collate._strings.numeric(sorteme, invalid)
- words.append((num, self.key(alpha)))
+ alpha = self.key(collate._strings.strip_punc(alpha))
+ words.append((num, alpha))
return words
import unicodedata
-def strip_nonalnum(string):
- while string and not (string[0].isalpha() or string[0].isnumeric()):
def strip_punc(string):
    """Return *string* with punctuation and symbol characters removed.

    A character is dropped when its Unicode general category begins with
    ``P`` (punctuation) or ``S`` (symbol). All other characters, including
    whitespace, are kept in their original order.

    The result is always a string. A bare ``filter()`` call returns a
    string only on Python 2; on Python 3 it returns a lazy iterator, so the
    kept characters are joined explicitly.
    """
    return "".join(
        char for char in string if unicodedata.category(char)[0] not in "PS"
    )
+
def strip_ends(string):
    """Return *string* without leading or trailing separators/punctuation.

    Characters are trimmed from both ends while their Unicode general
    category begins with ``Z`` (separator), ``P`` (punctuation) or ``S``
    (symbol). Characters in the interior of the string are never touched.
    An empty (or otherwise falsy) input is returned unchanged.
    """
    if not string:
        return string

    def strippable(char):
        return unicodedata.category(char)[0] in "ZPS"

    # Locate the surviving span with two index scans and slice once, rather
    # than re-slicing the string for every stripped character.
    start, end = 0, len(string)
    while start < end and strippable(string[start]):
        start += 1
    while end > start and strippable(string[end - 1]):
        end -= 1
    return string[start:end]
numeric = None
start = 0
for i, char in enumerate(string):
+ category = unicodedata.category(char)
if numeric is None:
broke = False
if char.isnumeric():
elif numeric and char.isalpha():
broke = True
numeric = False
- elif numeric and char.isspace():
+ elif numeric and category in ["Zs", "Ps", "Pe"]:
broke = True
numeric = None
elif not numeric and char.isnumeric():
broke = True
numeric = True
if broke:
- strings.append(strip_nonalnum(string[start:i]))
+ strings.append(strip_ends(string[start:i]))
start = i
broke = False
- strings.append(strip_nonalnum(string[start:i + 1]))
+ strings.append(strip_ends(string[start:i + 1]))
return strings
def wordlike(string):
total = 0
for c in string:
v = unicodedata.numeric(c)
- if v >= 1:
+ if v >= 1 or v == 0:
total *= 10
total += v
return total