X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2F_strings.py;h=d872ed4a1bca483944cf6b3a4a86b870562dd314;hp=f81bfd7e20be5bc65529f85e3bccab5fb0f15d60;hb=7c67e10286c784b572703666a980e85b39b858ee;hpb=2a37219e2d9c0fe58e78d987a21f6e37cfd33940

diff --git a/collate/_strings.py b/collate/_strings.py
index f81bfd7..d872ed4 100644
--- a/collate/_strings.py
+++ b/collate/_strings.py
@@ -1,9 +1,12 @@
 import unicodedata
 
-def strip_nonalnum(string):
-    while string and not (string[0].isalpha() or string[0].isnumeric()):
+def strip_punc(string):
+    return filter(lambda c: unicodedata.category(c)[0] not in "PS", string)
+
+def strip_ends(string):
+    while string and unicodedata.category(string[0])[0] in "ZPS":
         string = string[1:]
-    while string and not (string[-1].isalpha() or string[-1].isnumeric()):
+    while string and unicodedata.category(string[-1])[0] in "ZPS":
         string = string[:-1]
     return string
 
@@ -15,6 +18,7 @@ def alnumsplit(string):
     numeric = None
     start = 0
     for i, char in enumerate(string):
+        category = unicodedata.category(char)
         if numeric is None:
             broke = False
             if char.isnumeric():
@@ -24,17 +28,17 @@ def alnumsplit(string):
         elif numeric and char.isalpha():
             broke = True
             numeric = False
-        elif numeric and char.isspace():
+        elif numeric and category in ["Zs", "Ps", "Pe"]:
             broke = True
             numeric = None
         elif not numeric and char.isnumeric():
             broke = True
             numeric = True
         if broke:
-            strings.append(strip_nonalnum(string[start:i]))
+            strings.append(strip_ends(string[start:i]))
             start = i
             broke = False
-    strings.append(strip_nonalnum(string[start:i + 1]))
+    strings.append(strip_ends(string[start:i + 1]))
     return strings
 
 def wordlike(string):
@@ -85,7 +89,7 @@ def numeric(orig, invalid=float('inf')):
         total = 0
         for c in string:
             v = unicodedata.numeric(c)
-            if v >= 1:
+            if v >= 1 or v == 0:
                 total *= 10
             total += v
         return total