X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2Fstrings.py;h=487257ffe6e8a02dbdde8b9be669a25d1a976ba5;hp=bc4ed629aec5a9b8be46d2da1e9779eca9eaa090;hb=a3bd28edb9b44a1531af3ef8c4ae9cd6a4e2d3b3;hpb=9a7cf6459c40d53b58634f2df56386bf52c12f7c diff --git a/collate/strings.py b/collate/strings.py index bc4ed62..487257f 100644 --- a/collate/strings.py +++ b/collate/strings.py @@ -1,3 +1,7 @@ +"""String utility functions for collation.""" + +__all__ = ["sortemes", "numeric", "normalize_number", "deroman"] + import unicodedata CONTINUE_ON = frozenset([ @@ -10,108 +14,170 @@ CONTINUE_ON = frozenset([ UNKNOWN, LETTER, NUMBER = range(3) -BREAKER = u"\u2029" - -def sortemes(string): +BREAKER = u"\u2028" # Line break character +HBREAKER = u"\u2029" # Paragraph break character +INFINITY = float('inf') + +KEEP_IN_NUMBERS = u"'.," +ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_" + +ROMAN = { + u"i": 1, + u"v": 5, + u"x": 10, + u"l": 50, + u"c": 100, + u"d": 500, + u"m": 1000, + u"\u2180": 1000, + u"\u2181": 5000, + u"\u2182": 10000, + u"\u2183": 100, + u"\u2184": 100, + u"\u2185": 6, + u"\u2186": 50, + u"\u2187": 50000, + u"\u2188": 100000, + } + +def stripends(word): + """Strip punctuation and symbols from the ends of a string.""" + while word and unicodedata.category(word[0])[0] in "PS": + word = word[1:] + while word and unicodedata.category(word[-1])[0] in "PS": + word = word[:-1] + return word + +def sortemes(string, key=lambda s: s): """Generate a list of sortemes for the string. A sorteme, by analogy with grapheme/morpheme/etc. is an atom of sort information. This is larger than a word boundry but smaller than a sentence boundry; roughly, a sorteme boundry occurs between - letters and numbers, between numbers and numbrs if 'too much' + letters and numbers, between numbers and numbers if 'too much' punctuation exists in between, between lines. There is no formal specification for sortemes; the goal of this function is to provide good output for Collator.sortemekey. + """ - words = [] if not string: - return words + return [] + + words = [] + letters = [] + digits = [] + lappend = letters.append + dappend = digits.append string = unicode(string) - start = None - last = None - mode = UNKNOWN - previous_mode = UNKNOWN - category = "XX" - - # TODO(jfw): This kind of evolved over time, there's probably a much - # faster / more concise way to express it now. - for i, c in enumerate(string): - broke = False - prev_category = category - this_mode = mode - category = unicodedata.category(c) + categories = map(unicodedata.category, string) + previous = UNKNOWN + wappend = words.append + join = u"".join + i = 0 + + for uchar in string: + category = categories[i] + + if letters and previous == LETTER and words: + word = stripends(words.pop()[1].strip()) + BREAKER + letters.insert(0, word) + previous = UNKNOWN # Split at the first letter following a number or # non-continuing character. if category[0] == "L": - if mode != LETTER: - broke = True - mode = LETTER + lappend(uchar) + if digits: + words.append((numeric(join(digits).strip()), u'')) + del(digits[:]) + previous = NUMBER # Split at the first number following a non-number or # non-continuing character. elif category[0] == "N": - if mode != NUMBER: - broke = True - mode = NUMBER + dappend(uchar) + if letters: + if unicodedata.category(letters[-1])[0] == "L": + lappend(HBREAKER) + wappend((INFINITY, stripends(join(letters)))) + del(letters[:]) + previous = LETTER + + # Only certain punctuation allowed in numbers. + elif digits and uchar not in ALLOWED_IN_NUMBERS: + words.append((numeric(join(digits)), u'')) + del(digits[:]) + previous = NUMBER # Split if we find a non-continuing character ("weird" ones). elif category not in CONTINUE_ON: - broke = True - mode = UNKNOWN - - # Only certain punctuation allowed in numbers. - elif mode == NUMBER and category[0] == "P" and c not in "',._": - broke = True - mode = UNKNOWN + if letters: + wappend( + (INFINITY, + stripends(join(letters).strip() + BREAKER))) + del(letters[:]) + previous = LETTER + if digits: + words.append((numeric(join(digits)), u'')) + del(digits[:]) + previous = NUMBER # Split if we find two pieces of punctuation in a row, even # if we should otherwise continue. - elif i > 0 and prev_category[0] == "P" and category[0] == "P": - broke = True - mode = UNKNOWN - - if broke and start is not None and last is not None: - # If we read two strings separated by weird punctuation, - # pretend the punctuation isn't there. - if (this_mode == previous_mode == LETTER - and words): - words[-1] += BREAKER + string[start:last+1] - else: - # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"] - # Which sorts after ["foo", "bar"]. - if this_mode == NUMBER and previous_mode == LETTER and words: - words[-1] += BREAKER - words.append(string[start:last+1]) - previous_mode = this_mode - - if broke: - start = i - last = None - if category[0] in "LN": - last = i - this_mode = mode - if start is not None and last is not None: - if this_mode == LETTER and previous_mode == LETTER and words: - words[-1] += BREAKER + string[start:last+1] + elif i and categories[i - 1][0] == category[0] == "P": + if letters: + wappend((INFINITY, stripends(join(letters)))) + del(letters[:]) + previous = LETTER + if digits: + words.append((numeric(join(digits)), u'')) + del(digits[:]) + previous = NUMBER + else: - if this_mode == NUMBER and previous_mode == LETTER and words: - words[-1] += BREAKER - words.append(string[start:last+1]) - return words + if digits: + dappend(uchar) + elif letters: + lappend(uchar) + + i += 1 + + if letters and previous == LETTER and words: + word = stripends(words.pop()[1].strip()) + BREAKER + letters.insert(0, word) + previous = UNKNOWN + + if letters: + wappend((INFINITY, stripends(join(letters)))) + if digits: + words.append((numeric(join(digits)), u'')) + + return [(i, key(w)) for i, w in words] + +def numeric(orig, invalid=INFINITY): + """Parse a number out of a string. + + This function parses a unicode number out of the start of a + string. If a number cannot be found at the start, the 'invalid' + argument is returned. + + """ -def numeric(orig, invalid=float('inf')): if not orig: - return (invalid, '') + return invalid string = unicode(orig) - for c in string: - if c.isnumeric(): + for uchar in string: + if uchar.isnumeric(): break else: - return (invalid, orig) + return invalid + + for char in string: + if u"\u2160" <= char <= u"\u2188": + return deroman(string) mult = 1 while string[:1] == u"-" or string[:1] == u"+": @@ -120,37 +186,43 @@ def numeric(orig, invalid=float('inf')): string = string[1:] if not string[:1].isnumeric(): - return (invalid, orig) + return invalid - string = normalize_punc(string) - - # Early out if possible. - try: - return (float(string) * mult, orig) - except ValueError: - pass + string = normalize_number(string) - # Otherwise we need to do this the hard way. def _numeric(string): + """Interpreter a number as base 10.""" total = 0 - for c in string: - v = unicodedata.numeric(c) - if v >= 1 or v == 0: + for uchar in string: + number = unicodedata.numeric(uchar) + if number >= 1 or number == 0: total *= 10 - total += v + total += number return total try: whole, frac = string.split(".") whole = _numeric(whole) frac = _numeric(frac) / (10.0 ** len(frac)) - return (mult * (whole + frac), orig) + return mult * (whole + frac) except ValueError: - return (mult * _numeric(string), orig) + return mult * _numeric(string) + +def normalize_number(string): + """Normalize punctuation in a number. + + This function attempts to guess which characters in a number + represent grouping separators and which represent decimal + points. It returns a string that is valid to pass to Python's + float() routine (potentially, NaN, if nothing like a number is + found). + + """ + + string = unicode(string) + string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string) + string = string.strip(KEEP_IN_NUMBERS) -def normalize_punc(string): - string = unicode(string.strip(u",.'")) - string = filter(lambda u: u.isnumeric() or u in u",.'", string) commas = string.count(u",") stops = string.count(u".") quotes = string.count(u"'") @@ -167,7 +239,7 @@ def normalize_punc(string): quotes = 0 def normalize_two(a, b, string): - # One of each - assume the first is grouping, second is point. + """One of each - assume the first is grouping, second is point.""" a_idx = string.rindex(a) b_idx = string.rindex(b) if a_idx > b_idx: @@ -228,4 +300,21 @@ def normalize_punc(string): # Single stop, but no decimal - probably grouping. string = string.replace(u".", u"") - return string + return string or "NaN" + +def deroman(string): + """Turn a Roman numeral into an integer.""" + string = unicodedata.normalize('NFKD', unicode(string)).lower() + previous = 0 + building = 0 + for char in reversed(string): + try: + value = ROMAN[char] + except KeyError: + continue + if value < previous: + building -= value + else: + building += value + previous = value + return building