X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2F_strings.py;h=d872ed4a1bca483944cf6b3a4a86b870562dd314;hp=fd18bb917008dcb56aa06246dc86d1a721fe601c;hb=7c67e10286c784b572703666a980e85b39b858ee;hpb=53e1676b8d68cccd2b0692654d3871e44e0ba6b6 diff --git a/collate/_strings.py b/collate/_strings.py index fd18bb9..d872ed4 100644 --- a/collate/_strings.py +++ b/collate/_strings.py @@ -1,11 +1,24 @@ import unicodedata +def strip_punc(string): + return filter(lambda c: unicodedata.category(c)[0] not in "PS", string) + +def strip_ends(string): + while string and unicodedata.category(string[0])[0] in "ZPS": + string = string[1:] + while string and unicodedata.category(string[-1])[0] in "ZPS": + string = string[:-1] + return string + def alnumsplit(string): + if not string: + return [] string = unicode(string) strings = [] - word = [] numeric = None - for char in string: + start = 0 + for i, char in enumerate(string): + category = unicodedata.category(char) if numeric is None: broke = False if char.isnumeric(): @@ -14,16 +27,18 @@ def alnumsplit(string): numeric = False elif numeric and char.isalpha(): broke = True + numeric = False + elif numeric and category in ["Zs", "Ps", "Pe"]: + broke = True + numeric = None elif not numeric and char.isnumeric(): broke = True + numeric = True if broke: - if word: - strings.append(u"".join(word)) - word = [] - numeric = None - word.append(char) - if word: - strings.append(u"".join(word)) + strings.append(strip_ends(string[start:i])) + start = i + broke = False + strings.append(strip_ends(string[start:i + 1])) return strings def wordlike(string): @@ -44,6 +59,7 @@ def wordlike(string): def numeric(orig, invalid=float('inf')): if not orig: return (invalid, '') + string = unicode(orig) for c in string: if c.isnumeric(): @@ -57,6 +73,9 @@ def numeric(orig, invalid=float('inf')): mult = -mult string = string[1:] + if not string[:1].isnumeric(): + return (invalid, orig) + # Early out if possible. try: return (float(string) * mult, orig) @@ -70,7 +89,7 @@ def numeric(orig, invalid=float('inf')): total = 0 for c in string: v = unicodedata.numeric(c) - if v >= 1: + if v >= 1 or v == 0: total *= 10 total += v return total