From: Joe Wreschnig Date: Fri, 19 Feb 2010 09:23:57 +0000 (-0800) Subject: More tweaks; notably try to insert paragraph breaks rather than a separate Python... X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=commitdiff_plain;h=9a7cf6459c40d53b58634f2df56386bf52c12f7c More tweaks; notably try to insert paragraph breaks rather than a separate Python tuple when re-concatenating strings. --- diff --git a/collate/_abcollator.py b/collate/_abcollator.py index fdd7783..0ae5d45 100644 --- a/collate/_abcollator.py +++ b/collate/_abcollator.py @@ -1,14 +1,28 @@ -import collate._strings +import collate.strings class Collator(object): def cmp(self, string1, string2): """Return negative if a < b, zero if a == b, positive if a > b.""" return cmp(self.key(string1), self.key(string2)) + def words(self, string): + """Split the string along word boundries.""" + if isinstance(string, str): + string = string.decode(self.encoding, 'replace') + return string.split() + def sortemekey(self, string, invalid=float('inf')): + """Return a key based on sortemes of a string. + + If the string is a str instance, it is decoded to a unicode + instance according to the 'encoding' attribute of the + Collator. + """ keys = [] - for sorteme in collate._strings.sortemes(string): - num, alpha = collate._strings.numeric(sorteme, invalid) + if isinstance(string, str): + string = string.decode(self.encoding, 'replace') + for sorteme in collate.strings.sortemes(string): + num, alpha = collate.strings.numeric(sorteme, invalid) if num == invalid: keys.append(self.key(alpha)) else: diff --git a/collate/_strings.py b/collate/_strings.py deleted file mode 100644 index aed2ba7..0000000 --- a/collate/_strings.py +++ /dev/null @@ -1,227 +0,0 @@ -import unicodedata - -CONTINUE_ON = frozenset([ - "Ll", "Lm", "Lo", "Lt", "Lu", - "Mc", "Me", "Mn", - "Nd", "Nl", "No", - "Po", - "Zs", - ]) - -UNKNOWN, LETTER, NUMBER = range(3) - -def sortemes(string): - """Generate a list of sortemes for the string. - - A sorteme, by analogy with grapheme/morpheme/etc. is an atom of - sort information. This is larger than a word boundry but smaller - than a sentence boundry; roughly, a sorteme boundry occurs between - letters and numbers, between numbers and numbrs if 'too much' - punctuation exists in between, between lines. - - There is no formal specification for sortemes; the goal of this - function is to provide good output for Collator.sortemekey. - """ - - words = [] - if not string: - return words - string = unicode(string) - start = None - last = None - mode = UNKNOWN - previous_mode = UNKNOWN - category = "XX" - for i, c in enumerate(string): - broke = False - prev_category = category - this_mode = mode - category = unicodedata.category(c) - - # Split at the first letter following a number or - # non-continuing character. - if category[0] == "L": - if mode != LETTER: - broke = True - mode = LETTER - - # Split at the first number following a non-number or - # non-continuing character. - elif category[0] == "N": - if mode != NUMBER: - broke = True - mode = NUMBER - - # Split if we find a non-continuing character ("weird" ones). - elif category not in CONTINUE_ON: - broke = True - mode = UNKNOWN - - # Only certain punctuation allowed in numbers. - elif mode == NUMBER and category[0] == "P" and c not in "',._": - broke = True - mode = UNKNOWN - - # Split if we find two pieces of punctuation in a row, even - # if we should otherwise continue. - elif i > 0 and prev_category[0] == "P" and category[0] == "P": - broke = True - mode = UNKNOWN - - if broke and start is not None and last is not None: - # If we read two strings separated by weird punctuation, - # pretend the punctuation isn't there. - if (this_mode == previous_mode == LETTER - and (category[0] == "P" or prev_category[0] == "P") - and words): - words[-1] += u" " + string[start:last+1] - else: - # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"] - # Which sorts after ["foo", "bar"]. - if this_mode == NUMBER and previous_mode == LETTER and words: - words[-1] += u" " - words.append(string[start:last+1]) - previous_mode = this_mode - - if broke: - start = i - last = None - if category[0] in "LN": - last = i - this_mode = mode - if start is not None and last is not None: - if this_mode == LETTER and previous_mode == LETTER and words: - words[-1] += u" " + string[start:last+1] - else: - if this_mode == NUMBER and previous_mode == LETTER and words: - words[-1] += u" " - words.append(string[start:last+1]) - return words - -def numeric(orig, invalid=float('inf')): - if not orig: - return (invalid, '') - - string = unicode(orig) - for c in string: - if c.isnumeric(): - break - else: - return (invalid, orig) - - mult = 1 - while string[:1] == u"-" or string[:1] == u"+": - if string[:1] == u"-": - mult = -mult - string = string[1:] - - if not string[:1].isnumeric(): - return (invalid, orig) - - string = normalize_punc(string) - - # Early out if possible. - try: - return (float(string) * mult, orig) - except ValueError: - pass - - # Otherwise we need to do this the hard way. - def _numeric(string): - total = 0 - for c in string: - v = unicodedata.numeric(c) - if v >= 1 or v == 0: - total *= 10 - total += v - return total - - try: - whole, frac = string.split(".") - whole = _numeric(whole) - frac = _numeric(frac) / (10.0 ** len(frac)) - return (mult * (whole + frac), orig) - except ValueError: - return (mult * _numeric(string), orig) - -def normalize_punc(string): - string = unicode(string.strip(u",.'")) - string = filter(lambda u: u.isnumeric() or u in u",.'", string) - commas = string.count(u",") - stops = string.count(u".") - quotes = string.count(u"'") - - # If anything occurs more than once, it's a separator. - if commas > 1: - string = string.replace(u",", u"") - commas = 0 - if stops > 1: - string = string.replace(u".", u"") - stops = 0 - if quotes > 1: - string = string.replace(u"'", u"") - quotes = 0 - - def normalize_two(a, b, string): - # One of each - assume the first is grouping, second is point. - a_idx = string.rindex(a) - b_idx = string.rindex(b) - if a_idx > b_idx: - string = string.replace(b, u"").replace(a, u".") - else: - string = string.replace(a, u"").replace(b, u".") - return string - - if commas and stops and quotes: - # If all three, assume the middle is the decimal point. - # A,AAA.BB'CC - # A.AAA,BB'CC - # A,AAA'BB.CC - # A.AAA'BB,CC - # Not really valid, so do whatever we want... - # A'AAA.BB,CC - # A'AAA,BB.CC - comma_idx = string.index(u",") - stops_idx = string.index(u".") - quotes_idx = string.index(u"'") - if (comma_idx < stops_idx < quotes_idx - or quotes_idx < stops_idx < comma_idx): - string = string.replace(u",", u"").replace(u"'", u"") - elif (comma_idx < quotes_idx < stops_idx - or stops_idx < quotes_idx < comma_idx): - string = string.replace( - u",", u"").replace( - u".", u"").replace( - u"'", u".") - else: - string = string.replace( - u"'", u"").replace( - u".", u"").replace( - u",", u".") - - elif stops and quotes: - string = normalize_two(u".", u"'", string) - - elif commas and quotes: - string = normalize_two(u",", u"'", string) - - elif commas and stops: - string = normalize_two(u",", u".", string) - - elif commas: - if string[-4:-3] == u"," and len(string) <= 7: - # Single comma as a thousands separator. - string = string.replace(u",", u"") - else: - # Single comma, not thousands - probably a decimal point. - string = string.replace(u",", u".") - - elif quotes: - # Single quote, probably MM'SS", equivalent to a decimal point. - string = string.replace(u"'", u".") - - elif stops and string[-4:] == ".000": - # Single stop, but no decimal - probably grouping. - string = string.replace(u".", u"") - - return string diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py index 892b8a1..5f3ec05 100644 --- a/collate/icu/__init__.py +++ b/collate/icu/__init__.py @@ -36,6 +36,13 @@ class Collator(collate._abcollator.Collator): # so this is a harmless error. self._breaker = _icu.WordBreaker("root") + def words(self, string): + """Split the string along word boundries.""" + if isinstance(string, str): + string = string.decode(self.encoding) + words = self._breaker.words(string) + return [w for w in words if not w.isspace()] + def key(self, string): """Sort key for a string. diff --git a/collate/strings.py b/collate/strings.py new file mode 100644 index 0000000..bc4ed62 --- /dev/null +++ b/collate/strings.py @@ -0,0 +1,231 @@ +import unicodedata + +CONTINUE_ON = frozenset([ + "Ll", "Lm", "Lo", "Lt", "Lu", + "Mc", "Me", "Mn", + "Nd", "Nl", "No", + "Po", + "Zs", + ]) + +UNKNOWN, LETTER, NUMBER = range(3) + +BREAKER = u"\u2029" + +def sortemes(string): + """Generate a list of sortemes for the string. + + A sorteme, by analogy with grapheme/morpheme/etc. is an atom of + sort information. This is larger than a word boundry but smaller + than a sentence boundry; roughly, a sorteme boundry occurs between + letters and numbers, between numbers and numbrs if 'too much' + punctuation exists in between, between lines. + + There is no formal specification for sortemes; the goal of this + function is to provide good output for Collator.sortemekey. + """ + + words = [] + if not string: + return words + string = unicode(string) + start = None + last = None + mode = UNKNOWN + previous_mode = UNKNOWN + category = "XX" + + # TODO(jfw): This kind of evolved over time, there's probably a much + # faster / more concise way to express it now. + for i, c in enumerate(string): + broke = False + prev_category = category + this_mode = mode + category = unicodedata.category(c) + + # Split at the first letter following a number or + # non-continuing character. + if category[0] == "L": + if mode != LETTER: + broke = True + mode = LETTER + + # Split at the first number following a non-number or + # non-continuing character. + elif category[0] == "N": + if mode != NUMBER: + broke = True + mode = NUMBER + + # Split if we find a non-continuing character ("weird" ones). + elif category not in CONTINUE_ON: + broke = True + mode = UNKNOWN + + # Only certain punctuation allowed in numbers. + elif mode == NUMBER and category[0] == "P" and c not in "',._": + broke = True + mode = UNKNOWN + + # Split if we find two pieces of punctuation in a row, even + # if we should otherwise continue. + elif i > 0 and prev_category[0] == "P" and category[0] == "P": + broke = True + mode = UNKNOWN + + if broke and start is not None and last is not None: + # If we read two strings separated by weird punctuation, + # pretend the punctuation isn't there. + if (this_mode == previous_mode == LETTER + and words): + words[-1] += BREAKER + string[start:last+1] + else: + # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"] + # Which sorts after ["foo", "bar"]. + if this_mode == NUMBER and previous_mode == LETTER and words: + words[-1] += BREAKER + words.append(string[start:last+1]) + previous_mode = this_mode + + if broke: + start = i + last = None + if category[0] in "LN": + last = i + this_mode = mode + if start is not None and last is not None: + if this_mode == LETTER and previous_mode == LETTER and words: + words[-1] += BREAKER + string[start:last+1] + else: + if this_mode == NUMBER and previous_mode == LETTER and words: + words[-1] += BREAKER + words.append(string[start:last+1]) + return words + +def numeric(orig, invalid=float('inf')): + if not orig: + return (invalid, '') + + string = unicode(orig) + for c in string: + if c.isnumeric(): + break + else: + return (invalid, orig) + + mult = 1 + while string[:1] == u"-" or string[:1] == u"+": + if string[:1] == u"-": + mult = -mult + string = string[1:] + + if not string[:1].isnumeric(): + return (invalid, orig) + + string = normalize_punc(string) + + # Early out if possible. + try: + return (float(string) * mult, orig) + except ValueError: + pass + + # Otherwise we need to do this the hard way. + def _numeric(string): + total = 0 + for c in string: + v = unicodedata.numeric(c) + if v >= 1 or v == 0: + total *= 10 + total += v + return total + + try: + whole, frac = string.split(".") + whole = _numeric(whole) + frac = _numeric(frac) / (10.0 ** len(frac)) + return (mult * (whole + frac), orig) + except ValueError: + return (mult * _numeric(string), orig) + +def normalize_punc(string): + string = unicode(string.strip(u",.'")) + string = filter(lambda u: u.isnumeric() or u in u",.'", string) + commas = string.count(u",") + stops = string.count(u".") + quotes = string.count(u"'") + + # If anything occurs more than once, it's a separator. + if commas > 1: + string = string.replace(u",", u"") + commas = 0 + if stops > 1: + string = string.replace(u".", u"") + stops = 0 + if quotes > 1: + string = string.replace(u"'", u"") + quotes = 0 + + def normalize_two(a, b, string): + # One of each - assume the first is grouping, second is point. + a_idx = string.rindex(a) + b_idx = string.rindex(b) + if a_idx > b_idx: + string = string.replace(b, u"").replace(a, u".") + else: + string = string.replace(a, u"").replace(b, u".") + return string + + if commas and stops and quotes: + # If all three, assume the middle is the decimal point. + # A,AAA.BB'CC + # A.AAA,BB'CC + # A,AAA'BB.CC + # A.AAA'BB,CC + # Not really valid, so do whatever we want... + # A'AAA.BB,CC + # A'AAA,BB.CC + comma_idx = string.index(u",") + stops_idx = string.index(u".") + quotes_idx = string.index(u"'") + if (comma_idx < stops_idx < quotes_idx + or quotes_idx < stops_idx < comma_idx): + string = string.replace(u",", u"").replace(u"'", u"") + elif (comma_idx < quotes_idx < stops_idx + or stops_idx < quotes_idx < comma_idx): + string = string.replace( + u",", u"").replace( + u".", u"").replace( + u"'", u".") + else: + string = string.replace( + u"'", u"").replace( + u".", u"").replace( + u",", u".") + + elif stops and quotes: + string = normalize_two(u".", u"'", string) + + elif commas and quotes: + string = normalize_two(u",", u"'", string) + + elif commas and stops: + string = normalize_two(u",", u".", string) + + elif commas: + if string[-4:-3] == u"," and len(string) <= 7: + # Single comma as a thousands separator. + string = string.replace(u",", u"") + else: + # Single comma, not thousands - probably a decimal point. + string = string.replace(u",", u".") + + elif quotes: + # Single quote, probably MM'SS", equivalent to a decimal point. + string = string.replace(u"'", u".") + + elif stops and string[-4:] == ".000": + # Single stop, but no decimal - probably grouping. + string = string.replace(u".", u"") + + return string