From 91fd1e4a0bb531462bc443c21001376411ff862d Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Sun, 21 Feb 2010 20:28:46 -0800 Subject: [PATCH] Different algorithm, gives better results for numbers following grouping numbers but worse results for numbers following other punctuation. --- collate/strings.py | 124 +++++++++++++++----------- tests/en/numbersafternothing.list.txt | 6 ++ 2 files changed, 78 insertions(+), 52 deletions(-) create mode 100644 tests/en/numbersafternothing.list.txt diff --git a/collate/strings.py b/collate/strings.py index 267c6e5..5717246 100644 --- a/collate/strings.py +++ b/collate/strings.py @@ -10,7 +10,7 @@ CONTINUE_ON = frozenset([ UNKNOWN, LETTER, NUMBER = range(3) -BREAKER = u"\u2029" +BREAKER = u"\u2029" # Paragraph break character def sortemes(string): """Generate a list of sortemes for the string. @@ -26,77 +26,97 @@ def sortemes(string): """ words = [] + letters = [] + digits = [] if not string: return words string = unicode(string) - start = None - last = None - mode = UNKNOWN - previous_mode = UNKNOWN - category = "XX" + categories = map(unicodedata.category, string) + previous = UNKNOWN + types = [] + + def stripends(word): + while word and unicodedata.category(word[0])[0] in "PS": + word = word[1:] + while word and unicodedata.category(word[-1])[0] in "PS": + word = word[:-1] + return word # TODO(jfw): This kind of evolved over time, there's probably a much # faster / more concise way to express it now. - for i, c in enumerate(string): - broke = False - prev_category = category - this_mode = mode - category = unicodedata.category(c) + for i, (c, category) in enumerate(zip(string, categories)): + + if letters and previous == LETTER and words: + word = stripends(words.pop().strip()) + letters = list(stripends(word).strip() + BREAKER) + letters + previous = UNKNOWN # Split at the first letter following a number or # non-continuing character. 
if category[0] == "L": - if mode != LETTER: - broke = True - mode = LETTER + letters.append(c) + if digits: + words.append(u"".join(digits).strip()) + previous = NUMBER + digits = [] # Split at the first number following a non-number or # non-continuing character. elif category[0] == "N": - if mode != NUMBER: - broke = True - mode = NUMBER - - # Split if we find a non-continuing character ("weird" ones). - elif category not in CONTINUE_ON: - broke = True - mode = UNKNOWN + digits.append(c) + if letters: + words.append(u"".join(letters)) + previous = LETTER + letters = [] # Only certain punctuation allowed in numbers. - elif mode == NUMBER and category[0] == "P" and c not in "',._": - broke = True - mode = UNKNOWN + elif digits and c not in "',._": + words.append(u"".join(digits)) + previous = NUMBER + digits = [] + + # Split if we find a non-continuing character ("weird" ones). + elif letters and category not in CONTINUE_ON: + if letters: + words.append(u"".join(letters).strip() + BREAKER) + previous = LETTER + letters = [] + if digits: + words.append(u"".join(digits).strip() + BREAKER) + previous = NUMBER + digits = [] # Split if we find two pieces of punctuation in a row, even # if we should otherwise continue. - elif prev_category[0] in "P" and category[0] in "P": - broke = True - mode = UNKNOWN - - if broke and start is not None and last is not None: - # If we read two strings separated by weird punctuation, - # pretend the punctuation isn't there. 
- if this_mode == previous_mode == LETTER: - words[-1] += BREAKER + string[start:last+1] - else: - if this_mode == NUMBER and previous_mode == LETTER: - words[-1] += BREAKER - words.append(string[start:last+1]) - previous_mode = this_mode - - if broke: - start = i - last = None - if category[0] in "LN": - last = i - this_mode = mode - if start is not None and last is not None: - if this_mode == LETTER and previous_mode == LETTER and words: - words[-1] += BREAKER + string[start:last+1] + elif i and categories[i-1][0] in "P" and category[0] in "P": + if letters: + words.append(u"".join(letters)) + previous = LETTER + letters = [] + if digits: + words.append(u"".join(digits)) + previous = NUMBER + digits = [] + else: - if this_mode == NUMBER and previous_mode == LETTER and words: - words[-1] += BREAKER - words.append(string[start:last+1]) + if digits: + digits.append(c) + elif letters: + letters.append(c) + + if letters and previous == LETTER and words: + word = stripends(words.pop().strip()) + letters = list(stripends(word).strip() + BREAKER) + letters + previous = UNKNOWN + + if letters: + words.append(u"".join(letters)) + letters = [] + if digits: + words.append(u"".join(digits)) + digits = [] + + words = map(stripends, words) return words def numeric(orig, invalid=float('inf')): diff --git a/tests/en/numbersafternothing.list.txt b/tests/en/numbersafternothing.list.txt new file mode 100644 index 0000000..fc38492 --- /dev/null +++ b/tests/en/numbersafternothing.list.txt @@ -0,0 +1,6 @@ +Promised land +Promised Land +Promised Land (Loren & Mash Studio Live) +Promised Land (Reprise) +Promised land 2005 +Promised land 2005 (TETSU P'UNK vocalless version) -- 2.20.1