From e8b221311bf60883d3f14083bab63fd947d34509 Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Mon, 22 Feb 2010 20:01:12 -0800 Subject: [PATCH 1/1] More release preparation. Docstrings and consistency work. --- README.txt | 2 -- collate/__init__.py | 6 ++++ collate/_locale.py | 22 ++++++------ collate/errors.py | 2 +- collate/strings.py | 85 +++++++++++++++++++++++++++++++-------------- setup.py | 12 ++++++- 6 files changed, 87 insertions(+), 42 deletions(-) diff --git a/README.txt b/README.txt index c1f9236..31f5bb6 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,3 @@ -This module is unsuitable for real-world use at this time. - pycollate - Collation algorithms for Python ------------------------------------------- diff --git a/collate/__init__.py b/collate/__init__.py index 52d9e03..5094071 100644 --- a/collate/__init__.py +++ b/collate/__init__.py @@ -11,6 +11,12 @@ If available, this module uses the ICU localization library. Otherwise, it uses the system's locale database (and produces significantly worse results). +This module tries very hard not to fail loudly. It tends to ignore +most Unicode recoding errors, and will eventually fall back to the C +locale or raw codepoint-based collation. If you would like loud +failure, you can use the collate.strings module and the individual +Collators directly. + Trivial Use: ------------ strings = read_strings(...) diff --git a/collate/_locale.py b/collate/_locale.py index d20e184..f297f6b 100644 --- a/collate/_locale.py +++ b/collate/_locale.py @@ -46,21 +46,21 @@ def localelist(*locales): added = set() retlist = [] - for code in locales: - if not code: + for locale_ in locales: + if not locale_: continue if locale is not None: - code = locale.normalize(code) + locale_ = locale.normalize(locale_) # Strip off encoding if present. 
- code = code.split(".")[0] - if code.lower() not in added: - retlist.append(code) - added.add(code.lower()) + locale_ = locale_.split(".")[0] + if locale_.lower() not in added: + retlist.append(locale_) + added.add(locale_.lower()) # Strip off territory if present. - code = code.split("_")[0] - if code.lower() not in added: - retlist.append(code) - added.add(code.lower()) + locale_ = locale_.split("_")[0] + if locale_.lower() not in added: + retlist.append(locale_) + added.add(locale_.lower()) return retlist diff --git a/collate/errors.py b/collate/errors.py index a425087..13a1e12 100644 --- a/collate/errors.py +++ b/collate/errors.py @@ -8,4 +8,4 @@ class InvalidLocaleError(LookupError): """ def __init__(self, locale, string=""): self.locale = locale - ValueError.__init__(self, string or locale) + LookupError.__init__(self, string or locale) diff --git a/collate/strings.py b/collate/strings.py index 5badc8c..8d6af99 100644 --- a/collate/strings.py +++ b/collate/strings.py @@ -1,3 +1,7 @@ +"""String utility functions for collation.""" + +__all__ = ["sortemes", "numeric", "normalize_number"] + import unicodedata CONTINUE_ON = frozenset([ @@ -13,6 +17,17 @@ UNKNOWN, LETTER, NUMBER = range(3) BREAKER = u"\u2029" # Paragraph break character INFINITY = float('inf') +KEEP_IN_NUMBERS = u"'.," +ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_" + +def stripends(word): + """Strip punctuation and symbols from the ends of a string.""" + while word and unicodedata.category(word[0])[0] in "PS": + word = word[1:] + while word and unicodedata.category(word[-1])[0] in "PS": + word = word[:-1] + return word + def sortemes(string, key=lambda s: s): """Generate a list of sortemes for the string. @@ -24,6 +39,7 @@ def sortemes(string, key=lambda s: s): There is no formal specification for sortemes; the goal of this function is to provide good output for Collator.sortemekey. 
+ """ words = [] @@ -35,21 +51,16 @@ def sortemes(string, key=lambda s: s): categories = map(unicodedata.category, string) previous = UNKNOWN - def stripends(word): - while word and unicodedata.category(word[0])[0] in "PS": - word = word[1:] - while word and unicodedata.category(word[-1])[0] in "PS": - word = word[:-1] - return word - def aletters(letters): + """Add a group of letters to the word list.""" words.append((INFINITY, stripends(letters))) def adigits(digits): + """Add a group of digits to the word list.""" words.append((numeric(digits), u'')) # TODO(jfw): This kind of evolved over time, there's probably a much # faster / more concise way to express it now. - for i, (c, category) in enumerate(zip(string, categories)): + for i, (uchar, category) in enumerate(zip(string, categories)): if letters and previous == LETTER and words: word = stripends(words.pop()[1].strip()) + BREAKER @@ -59,7 +70,7 @@ def sortemes(string, key=lambda s: s): # Split at the first letter following a number or # non-continuing character. if category[0] == "L": - letters.append(c) + letters.append(uchar) if digits: adigits(u"".join(digits).strip()) digits = [] @@ -68,14 +79,14 @@ def sortemes(string, key=lambda s: s): # Split at the first number following a non-number or # non-continuing character. elif category[0] == "N": - digits.append(c) + digits.append(uchar) if letters: aletters(u"".join(letters)) letters = [] previous = LETTER # Only certain punctuation allowed in numbers. 
- elif digits and c not in "',._": + elif digits and uchar not in ALLOWED_IN_NUMBERS: adigits(u"".join(digits)) digits = [] previous = NUMBER @@ -105,9 +116,9 @@ def sortemes(string, key=lambda s: s): else: if digits: - digits.append(c) + digits.append(uchar) elif letters: - letters.append(c) + letters.append(uchar) if letters and previous == LETTER and words: word = stripends(words.pop()[1].strip()) + BREAKER @@ -122,12 +133,20 @@ def sortemes(string, key=lambda s: s): return [(i, key(w) if w else u'') for i, w in words] def numeric(orig, invalid=INFINITY): + """Parse a number out of a string. + + This function parses a unicode number out of the start of a + string. If a number cannot be found at the start, the 'invalid' + argument is returned. + + """ + if not orig: return invalid string = unicode(orig) - for c in string: - if c.isnumeric(): + for uchar in string: + if uchar.isnumeric(): break else: return invalid @@ -139,18 +158,18 @@ def numeric(orig, invalid=INFINITY): string = string[1:] if not string[:1].isnumeric(): - return (invalid, orig) + return invalid - string = normalize_punc(string) + string = normalize_number(string) - # Otherwise we need to do this the hard way. def _numeric(string): + """Interpret a number as base 10.""" total = 0 - for c in string: - v = unicodedata.numeric(c) - if v >= 1 or v == 0: + for uchar in string: + number = unicodedata.numeric(uchar) + if number >= 1 or number == 0: total *= 10 - total += v + total += number return total try: @@ -161,9 +180,21 @@ def numeric(orig, invalid=INFINITY): except ValueError: return mult * _numeric(string) -def normalize_punc(string): - string = unicode(string.strip(u",.'")) - string = filter(lambda u: u.isnumeric() or u in u",.'", string) +def normalize_number(string): + """Normalize punctuation in a number. + + This function attempts to guess which characters in a number + represent grouping separators and which represent decimal + points. 
It returns a string that is valid to pass to Python's + float() routine (potentially, NaN, if nothing like a number is + found). + + """ + + string = unicode(string) + string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string) + string = string.strip(KEEP_IN_NUMBERS) + commas = string.count(u",") stops = string.count(u".") quotes = string.count(u"'") @@ -180,7 +211,7 @@ def normalize_punc(string): quotes = 0 def normalize_two(a, b, string): - # One of each - assume the first is grouping, second is point. + """One of each - assume the first is grouping, second is point.""" a_idx = string.rindex(a) b_idx = string.rindex(b) if a_idx > b_idx: @@ -241,4 +272,4 @@ def normalize_punc(string): # Single stop, but no decimal - probably grouping. string = string.replace(u".", u"") - return string + return string or "NaN" diff --git a/setup.py b/setup.py index 111cfac..99a13af 100755 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ else: libraries = ['icui18n', 'icuuc', 'icudata'] setup(name='collate', - version='0', + version='0.1', author="Joe Wreschnig", author_email="joe.wreschnig@gmail.com", description="Python text collation", @@ -22,4 +22,14 @@ setup(name='collate', libraries=libraries)], cmdclass=dict(build_ext=build_ext), packages=["collate", "collate.icu"], + long_description="""\ +This module provides tools to sort strings in a 'human-expected' +order. Because human expectations are fuzzy and often +self-contradictory, the sort order is not guaranteed to be stable +between versions of this module (rather the opposite - the primary +reason to update it will probably be changed sort results). If +available, this module uses the ICU localization library. Otherwise, +it uses the system's locale database (and produces significantly worse +results). +""" ) -- 2.20.1