X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=blobdiff_plain;f=collate%2F_strings.py;h=aed2ba71c5394a0de89c5d05edb3ce65b3013efa;hp=6bcfd9c5393c824e12edf84257c92866b9c48555;hb=c2e6d9413b5780bf96302569cf641f09de90f9fa;hpb=96cd5d3ad9dd1390c7739a6c3b9fa03ac3a2b4ff diff --git a/collate/_strings.py b/collate/_strings.py index 6bcfd9c..aed2ba7 100644 --- a/collate/_strings.py +++ b/collate/_strings.py @@ -1,84 +1,175 @@ -def alnumsplit(string): +import unicodedata + +CONTINUE_ON = frozenset([ + "Ll", "Lm", "Lo", "Lt", "Lu", + "Mc", "Me", "Mn", + "Nd", "Nl", "No", + "Po", + "Zs", + ]) + +UNKNOWN, LETTER, NUMBER = range(3) + +def sortemes(string): + """Generate a list of sortemes for the string. + + A sorteme, by analogy with grapheme/morpheme/etc. is an atom of + sort information. This is larger than a word boundry but smaller + than a sentence boundry; roughly, a sorteme boundry occurs between + letters and numbers, between numbers and numbrs if 'too much' + punctuation exists in between, between lines. + + There is no formal specification for sortemes; the goal of this + function is to provide good output for Collator.sortemekey. + """ + + words = [] + if not string: + return words string = unicode(string) - strings = [] - word = [] - numeric = None - for char in string: - if numeric is None: - broke = False - if char.isnumeric(): - numeric = True - elif char.isalpha(): - numeric = False - elif numeric and char.isalpha(): + start = None + last = None + mode = UNKNOWN + previous_mode = UNKNOWN + category = "XX" + for i, c in enumerate(string): + broke = False + prev_category = category + this_mode = mode + category = unicodedata.category(c) + + # Split at the first letter following a number or + # non-continuing character. + if category[0] == "L": + if mode != LETTER: + broke = True + mode = LETTER + + # Split at the first number following a non-number or + # non-continuing character. + elif category[0] == "N": + if mode != NUMBER: + broke = True + mode = NUMBER + + # Split if we find a non-continuing character ("weird" ones). + elif category not in CONTINUE_ON: + broke = True + mode = UNKNOWN + + # Only certain punctuation allowed in numbers. + elif mode == NUMBER and category[0] == "P" and c not in "',._": broke = True - elif not numeric and char.isnumeric(): + mode = UNKNOWN + + # Split if we find two pieces of punctuation in a row, even + # if we should otherwise continue. + elif i > 0 and prev_category[0] == "P" and category[0] == "P": broke = True + mode = UNKNOWN + + if broke and start is not None and last is not None: + # If we read two strings separated by weird punctuation, + # pretend the punctuation isn't there. + if (this_mode == previous_mode == LETTER + and (category[0] == "P" or prev_category[0] == "P") + and words): + words[-1] += u" " + string[start:last+1] + else: + # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"] + # Which sorts after ["foo", "bar"]. + if this_mode == NUMBER and previous_mode == LETTER and words: + words[-1] += u" " + words.append(string[start:last+1]) + previous_mode = this_mode + if broke: - if word: - strings.append("".join(word)) - word = [] - numeric = None - word.append(char) - if word: - strings.append("".join(word)) - return strings - -def wordlike(string): - """Check if a string is 'word-like'. - - Word-like strings contain at least one alphanumeric character. - """ - return any(map(type(string).isalnum, string)) + start = i + last = None + if category[0] in "LN": + last = i + this_mode = mode + if start is not None and last is not None: + if this_mode == LETTER and previous_mode == LETTER and words: + words[-1] += u" " + string[start:last+1] + else: + if this_mode == NUMBER and previous_mode == LETTER and words: + words[-1] += u" " + words.append(string[start:last+1]) + return words -def numeric(string, invalid=float('inf')): - string = unicode(string) - if not any(map(type(string).isnumeric, string)): - return (invalid, string) - if not string: +def numeric(orig, invalid=float('inf')): + if not orig: return (invalid, '') + string = unicode(orig) + for c in string: + if c.isnumeric(): + break + else: + return (invalid, orig) + mult = 1 - while string[:1] == "-" or string[:1] == "+": - if string[0] == "-": + while string[:1] == u"-" or string[:1] == u"+": + if string[:1] == u"-": mult = -mult string = string[1:] - # Maybe we got lucky and this is a trivial case... + if not string[:1].isnumeric(): + return (invalid, orig) + + string = normalize_punc(string) + + # Early out if possible. try: - return float(string) * mult + return (float(string) * mult, orig) except ValueError: pass # Otherwise we need to do this the hard way. - return mult * float(normalize_dots(string)) + def _numeric(string): + total = 0 + for c in string: + v = unicodedata.numeric(c) + if v >= 1 or v == 0: + total *= 10 + total += v + return total -def normalize_dots(string): - string = unicode(string.strip(",.'")) - string = filter(lambda u: u.isnumeric() or u in ",.'", string) - commas = string.count(",") - stops = string.count(".") - quotes = string.count("'") + try: + whole, frac = string.split(".") + whole = _numeric(whole) + frac = _numeric(frac) / (10.0 ** len(frac)) + return (mult * (whole + frac), orig) + except ValueError: + return (mult * _numeric(string), orig) + +def normalize_punc(string): + string = unicode(string.strip(u",.'")) + string = filter(lambda u: u.isnumeric() or u in u",.'", string) + commas = string.count(u",") + stops = string.count(u".") + quotes = string.count(u"'") # If anything occurs more than once, it's a separator. if commas > 1: - string = string.replace(",", "") + string = string.replace(u",", u"") commas = 0 if stops > 1: - string = string.replace(".", "") + string = string.replace(u".", u"") stops = 0 if quotes > 1: - string = string.replace("'", "") + string = string.replace(u"'", u"") quotes = 0 - def normalize_two(a, b): + def normalize_two(a, b, string): # One of each - assume the first is grouping, second is point. a_idx = string.rindex(a) b_idx = string.rindex(b) if a_idx > b_idx: - string = string.replace(b, "").replace(a, ".") + string = string.replace(b, u"").replace(a, u".") else: - string = string.replace(a, "").replace(b, ".") + string = string.replace(a, u"").replace(b, u".") return string if commas and stops and quotes: @@ -90,37 +181,47 @@ def normalize_dots(string): # Not really valid, so do whatever we want... # A'AAA.BB,CC # A'AAA,BB.CC - comma_idx = string.index(",") - stops_idx = string.index(".") - quotes_idx = string.index("'") + comma_idx = string.index(u",") + stops_idx = string.index(u".") + quotes_idx = string.index(u"'") if (comma_idx < stops_idx < quotes_idx or quotes_idx < stops_idx < comma_idx): - string = string.replace(",", "").replace("'", "") + string = string.replace(u",", u"").replace(u"'", u"") elif (comma_idx < quotes_idx < stops_idx or stops_idx < quotes_idx < comma_idx): - string = string.replace(",", "").replace(".", "").replace("'", ".") + string = string.replace( + u",", u"").replace( + u".", u"").replace( + u"'", u".") else: - string = string.replace("'", "").replace(".", "").replace(",", ".") + string = string.replace( + u"'", u"").replace( + u".", u"").replace( + u",", u".") elif stops and quotes: - string = normalize_two('.', "'") + string = normalize_two(u".", u"'", string) elif commas and quotes: - string = normalize_two(',', "'") + string = normalize_two(u",", u"'", string) elif commas and stops: - string = normalize_two(',', '.') + string = normalize_two(u",", u".", string) elif commas: - if string[-4:-3] == "," and len(string) <= 7: + if string[-4:-3] == u"," and len(string) <= 7: # Single comma as a thousands separator. - string = string.replace(",", "") + string = string.replace(u",", u"") else: # Single comma, not thousands - probably a decimal point. - string = string.replace(",", ".") + string = string.replace(u",", u".") elif quotes: # Single quote, probably MM'SS", equivalent to a decimal point. - string = string.replace("'", ".") + string = string.replace(u"'", u".") + + elif stops and string[-4:] == ".000": + # Single stop, but no decimal - probably grouping. + string = string.replace(u".", u"") return string