From 7c67e10286c784b572703666a980e85b39b858ee Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Tue, 16 Feb 2010 01:28:22 -0800 Subject: [PATCH] Category-based splitting. --- collate/_abcollator.py | 3 ++- collate/_strings.py | 18 +++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/collate/_abcollator.py b/collate/_abcollator.py index 02cb733..2e6ab95 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -19,5 +19,6 @@ class Collator(object): words = [] for sorteme in self.sortemes(string): num, alpha = collate._strings.numeric(sorteme, invalid) - words.append((num, self.key(alpha))) + alpha = self.key(collate._strings.strip_punc(alpha)) + words.append((num, alpha)) return words diff --git a/collate/_strings.py b/collate/_strings.py index f81bfd7..d872ed4 100644 --- a/collate/_strings.py +++ b/collate/_strings.py @@ -1,9 +1,12 @@ import unicodedata -def strip_nonalnum(string): - while string and not (string[0].isalpha() or string[0].isnumeric()): +def strip_punc(string): + return filter(lambda c: unicodedata.category(c)[0] not in "PS", string) + +def strip_ends(string): + while string and unicodedata.category(string[0])[0] in "ZPS": string = string[1:] - while string and not (string[-1].isalpha() or string[-1].isnumeric()): + while string and unicodedata.category(string[-1])[0] in "ZPS": string = string[:-1] return string @@ -15,6 +18,7 @@ def alnumsplit(string): numeric = None start = 0 for i, char in enumerate(string): + category = unicodedata.category(char) if numeric is None: broke = False if char.isnumeric(): @@ -24,17 +28,17 @@ def alnumsplit(string): elif numeric and char.isalpha(): broke = True numeric = False - elif numeric and char.isspace(): + elif numeric and category in ["Zs", "Ps", "Pe"]: broke = True numeric = None elif not numeric and char.isnumeric(): broke = True numeric = True if broke: - strings.append(strip_nonalnum(string[start:i])) + strings.append(strip_ends(string[start:i])) start = i broke = False - strings.append(strip_nonalnum(string[start:i + 1])) + strings.append(strip_ends(string[start:i + 1])) return strings def wordlike(string): @@ -85,7 +89,7 @@ def numeric(orig, invalid=float('inf')): total = 0 for c in string: v = unicodedata.numeric(c) - if v >= 1: + if v >= 1 or v == 0: total *= 10 total += v return total -- 2.20.1