X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2Fstrings.py;h=5badc8c66ae9826dc8335a9c8d00e129610caaff;hp=bc4ed629aec5a9b8be46d2da1e9779eca9eaa090;hb=576fe7bace3061ef949762141afbb3eb2ec31ecd;hpb=9a7cf6459c40d53b58634f2df56386bf52c12f7c diff --git a/collate/strings.py b/collate/strings.py index bc4ed62..5badc8c 100644 --- a/collate/strings.py +++ b/collate/strings.py @@ -10,15 +10,16 @@ CONTINUE_ON = frozenset([ UNKNOWN, LETTER, NUMBER = range(3) -BREAKER = u"\u2029" +BREAKER = u"\u2029" # Paragraph break character +INFINITY = float('inf') -def sortemes(string): +def sortemes(string, key=lambda s: s): """Generate a list of sortemes for the string. A sorteme, by analogy with grapheme/morpheme/etc. is an atom of sort information. This is larger than a word boundry but smaller than a sentence boundry; roughly, a sorteme boundry occurs between - letters and numbers, between numbers and numbrs if 'too much' + letters and numbers, between numbers and numbers if 'too much' punctuation exists in between, between lines. There is no formal specification for sortemes; the goal of this @@ -26,92 +27,110 @@ def sortemes(string): """ words = [] + letters = [] + digits = [] if not string: return words string = unicode(string) - start = None - last = None - mode = UNKNOWN - previous_mode = UNKNOWN - category = "XX" + categories = map(unicodedata.category, string) + previous = UNKNOWN + + def stripends(word): + while word and unicodedata.category(word[0])[0] in "PS": + word = word[1:] + while word and unicodedata.category(word[-1])[0] in "PS": + word = word[:-1] + return word + + def aletters(letters): + words.append((INFINITY, stripends(letters))) + def adigits(digits): + words.append((numeric(digits), u'')) # TODO(jfw): This kind of evolved over time, there's probably a much # faster / more concise way to express it now. - for i, c in enumerate(string): - broke = False - prev_category = category - this_mode = mode - category = unicodedata.category(c) + for i, (c, category) in enumerate(zip(string, categories)): + + if letters and previous == LETTER and words: + word = stripends(words.pop()[1].strip()) + BREAKER + letters.insert(0, word) + previous = UNKNOWN # Split at the first letter following a number or # non-continuing character. if category[0] == "L": - if mode != LETTER: - broke = True - mode = LETTER + letters.append(c) + if digits: + adigits(u"".join(digits).strip()) + digits = [] + previous = NUMBER # Split at the first number following a non-number or # non-continuing character. elif category[0] == "N": - if mode != NUMBER: - broke = True - mode = NUMBER - - # Split if we find a non-continuing character ("weird" ones). - elif category not in CONTINUE_ON: - broke = True - mode = UNKNOWN + digits.append(c) + if letters: + aletters(u"".join(letters)) + letters = [] + previous = LETTER # Only certain punctuation allowed in numbers. - elif mode == NUMBER and category[0] == "P" and c not in "',._": - broke = True - mode = UNKNOWN + elif digits and c not in "',._": + adigits(u"".join(digits)) + digits = [] + previous = NUMBER + + # Split if we find a non-continuing character ("weird" ones). + elif letters and category not in CONTINUE_ON: + if letters: + aletters(u"".join(letters).strip() + BREAKER) + letters = [] + previous = LETTER + if digits: + adigits(u"".join(digits).strip()) + digits = [] + previous = NUMBER # Split if we find two pieces of punctuation in a row, even # if we should otherwise continue. - elif i > 0 and prev_category[0] == "P" and category[0] == "P": - broke = True - mode = UNKNOWN - - if broke and start is not None and last is not None: - # If we read two strings separated by weird punctuation, - # pretend the punctuation isn't there. - if (this_mode == previous_mode == LETTER - and words): - words[-1] += BREAKER + string[start:last+1] - else: - # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"] - # Which sorts after ["foo", "bar"]. - if this_mode == NUMBER and previous_mode == LETTER and words: - words[-1] += BREAKER - words.append(string[start:last+1]) - previous_mode = this_mode - - if broke: - start = i - last = None - if category[0] in "LN": - last = i - this_mode = mode - if start is not None and last is not None: - if this_mode == LETTER and previous_mode == LETTER and words: - words[-1] += BREAKER + string[start:last+1] + elif i and categories[i-1][0] in "P" and category[0] in "P": + if letters: + aletters(u"".join(letters)) + letters = [] + previous = LETTER + if digits: + adigits(u"".join(digits)) + digits = [] + previous = NUMBER + else: - if this_mode == NUMBER and previous_mode == LETTER and words: - words[-1] += BREAKER - words.append(string[start:last+1]) - return words + if digits: + digits.append(c) + elif letters: + letters.append(c) -def numeric(orig, invalid=float('inf')): + if letters and previous == LETTER and words: + word = stripends(words.pop()[1].strip()) + BREAKER + letters.insert(0, word) + previous = UNKNOWN + + if letters: + aletters(u"".join(letters)) + if digits: + adigits(u"".join(digits)) + + return [(i, key(w) if w else u'') for i, w in words] + +def numeric(orig, invalid=INFINITY): if not orig: - return (invalid, '') + return invalid string = unicode(orig) for c in string: if c.isnumeric(): break else: - return (invalid, orig) + return invalid mult = 1 while string[:1] == u"-" or string[:1] == u"+": @@ -124,12 +143,6 @@ def numeric(orig, invalid=float('inf')): string = normalize_punc(string) - # Early out if possible. - try: - return (float(string) * mult, orig) - except ValueError: - pass - # Otherwise we need to do this the hard way. def _numeric(string): total = 0 @@ -144,9 +157,9 @@ def numeric(orig, invalid=float('inf')): whole, frac = string.split(".") whole = _numeric(whole) frac = _numeric(frac) / (10.0 ** len(frac)) - return (mult * (whole + frac), orig) + return mult * (whole + frac) except ValueError: - return (mult * _numeric(string), orig) + return mult * _numeric(string) def normalize_punc(string): string = unicode(string.strip(u",.'"))