From: Joe Wreschnig Date: Wed, 17 Feb 2010 10:04:15 +0000 (-0800) Subject: New approach - find split points based on Unicode categories. X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=commitdiff_plain;h=7dc546d67cd996c0a54b07261ab1b7bcdf4019ce New approach - find split points based on Unicode categories. --- diff --git a/collate/_abcollator.py b/collate/_abcollator.py index 2e6ab95..fdd7783 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -5,20 +5,14 @@ class Collator(object): """Return negative if a < b, zero if a == b, positive if a > b.""" return cmp(self.key(string1), self.key(string2)) - def words(self, string): - """Split the string into separate words. - - This split is done using Unicode's definition of whitespace. - """ - return string.split() - - def sortemes(self, string): - return collate._strings.alnumsplit(string) - def sortemekey(self, string, invalid=float('inf')): - words = [] - for sorteme in self.sortemes(string): + keys = [] + for sorteme in collate._strings.sortemes(string): num, alpha = collate._strings.numeric(sorteme, invalid) - alpha = self.key(collate._strings.strip_punc(alpha)) - words.append((num, alpha)) - return words + if num == invalid: + keys.append(self.key(alpha)) + else: + keys.append(num) + # Shove the sortkeyed original string on the end to resolve + # ties intelligently. 
+ return (keys, self.key(string)) diff --git a/collate/_strings.py b/collate/_strings.py index d872ed4..dde1739 100644 --- a/collate/_strings.py +++ b/collate/_strings.py @@ -1,60 +1,101 @@ import unicodedata -def strip_punc(string): - return filter(lambda c: unicodedata.category(c)[0] not in "PS", string) - -def strip_ends(string): - while string and unicodedata.category(string[0])[0] in "ZPS": - string = string[1:] - while string and unicodedata.category(string[-1])[0] in "ZPS": - string = string[:-1] - return string +CONTINUE_ON = frozenset([ + "Ll", "Lm", "Lo", "Lt", "Lu", + "Mc", "Me", "Mn", + "Nd", "Nl", "No", + "Po", + "Zs", + ]) + +UNKNOWN, LETTER, NUMBER = range(3) + +def sortemes(string): + """Generate a list of sortemes for the string. + + A sorteme, by analogy with grapheme/morpheme/etc. is an atom of + sort information. This is larger than a word boundary but smaller + than a sentence boundary; roughly, a sorteme boundary occurs between + letters and numbers, between numbers and numbers if 'too much' + punctuation exists in between, between lines. + + There is no formal specification for sortemes; the goal of this + function is to provide good output for Collator.sortemekey. + """ -def alnumsplit(string): + words = [] if not string: - return [] + return words string = unicode(string) - strings = [] - numeric = None - start = 0 - for i, char in enumerate(string): - category = unicodedata.category(char) - if numeric is None: - broke = False - if char.isnumeric(): - numeric = True - elif char.isalpha(): - numeric = False - elif numeric and char.isalpha(): + start = None + last = None + mode = UNKNOWN + previous_mode = UNKNOWN + category = "XX" + for i, c in enumerate(string): + broke = False + prev_category = category + this_mode = mode + category = unicodedata.category(c) + + # Split at the first letter following a number or + # non-continuing character.
+ if category[0] == "L": + if mode != LETTER: + broke = True + mode = LETTER + + # Split at the first number following a non-number or + # non-continuing character. + elif category[0] == "N": + if mode != NUMBER: + broke = True + mode = NUMBER + + # Split if we find a non-continuing character ("weird" ones). + elif category not in CONTINUE_ON: broke = True - numeric = False - elif numeric and category in ["Zs", "Ps", "Pe"]: + mode = UNKNOWN + + # Only certain punctuation allowed in numbers. + elif mode == NUMBER and category[0] == "P" and c not in "',._": broke = True - numeric = None - elif not numeric and char.isnumeric(): + mode = UNKNOWN + + # Split if we find two pieces of punctuation in a row, even + # if we should otherwise continue. + elif i > 0 and prev_category[0] == "P" and category[0] == "P": broke = True - numeric = True + mode = UNKNOWN + + if broke and start is not None and last is not None: + # If we read two strings separated by weird punctuation, + # pretend the punctuation isn't there. + if (this_mode == previous_mode == LETTER + and prev_category[0] == "P" + and words): + words[-1] += u" " + string[start:last+1] + else: + # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"] + # Which sorts after ["foo", "bar"]. + if this_mode == NUMBER and previous_mode == LETTER and words: + words[-1] += u" " + words.append(string[start:last+1]) + previous_mode = this_mode + if broke: - strings.append(strip_ends(string[start:i])) start = i - broke = False - strings.append(strip_ends(string[start:i + 1])) - return strings - -def wordlike(string): - """Check if a string is 'word-like'. - - Word-like strings contain at least one alphanumeric character. 
- """ - - # Explicit loop is faster than: - #return any(map(type(string).isalnum, string)) - - for c in string: - if c.isalnum(): - return True - else: - return False + last = None + if category[0] in "LN": + last = i + if start is not None and last is not None: + if this_mode == previous_mode == LETTER and words: + words[-1] += u" " + string[start:last+1] + else: + if this_mode == NUMBER and previous_mode == LETTER and words: + words[-1] += u" " + words.append(string[start:last+1]) + return words def numeric(orig, invalid=float('inf')): if not orig: @@ -76,6 +117,8 @@ def numeric(orig, invalid=float('inf')): if not string[:1].isnumeric(): return (invalid, orig) + string = normalize_punc(string) + # Early out if possible. try: return (float(string) * mult, orig) @@ -83,8 +126,6 @@ def numeric(orig, invalid=float('inf')): pass # Otherwise we need to do this the hard way. - string = normalize_punc(string) - def _numeric(string): total = 0 for c in string: @@ -178,4 +219,8 @@ def normalize_punc(string): # Single quote, probably MM'SS", equivalent to a decimal point. string = string.replace(u"'", u".") + elif stops and string[-4:] == ".000": + # Single stop, but no decimal - probably grouping. + string = string.replace(u".", u"") + return string diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py index 6f9647e..892b8a1 100644 --- a/collate/icu/__init__.py +++ b/collate/icu/__init__.py @@ -36,11 +36,6 @@ class Collator(collate._abcollator.Collator): # so this is a harmless error. self._breaker = _icu.WordBreaker("root") - def words(self, string): - if isinstance(string, str): - string = string.decode(self.encoding, 'replace') - return filter(lambda u: not u.isspace(), self._breaker.words(string)) - def key(self, string): """Sort key for a string.