More release preparation. Docstrings and consistency work.

author Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 23 Feb 2010 04:01:12 +0000 (20:01 -0800)

committer Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 23 Feb 2010 04:01:12 +0000 (20:01 -0800)
author Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 23 Feb 2010 04:01:12 +0000 (20:01 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 23 Feb 2010 04:01:12 +0000 (20:01 -0800)
diff --git a/README.txt b/README.txt

index c1f9236..31f5bb6 100644 (file)
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,3 @@
-This module is unsuitable for real-world use at this time.
-
  pycollate - Collation algorithms for Python
  -------------------------------------------
  
  pycollate - Collation algorithms for Python
  -------------------------------------------
  
diff --git a/collate/__init__.py b/collate/__init__.py

index 52d9e03..5094071 100644 (file)
--- a/collate/__init__.py
+++ b/collate/__init__.py
@@ -11,6 +11,12 @@ If available, this module uses the ICU localization library.
  Otherwise, it uses the system's locale database (and produces
  significantly worse results).
  
  Otherwise, it uses the system's locale database (and produces
  significantly worse results).
  
+This module tries very hard not to fail loudly. It tends to ignore
+most Unicode recoding errors, and will eventually fall back to the C
+locale or raw codepoint-based collation. If you would like loud
+failure, you can use the collate.strings module and the individual
+Collators directly.
+
  Trivial Use:
  ------------
  strings = read_strings(...)
  Trivial Use:
  ------------
  strings = read_strings(...)
diff --git a/collate/_locale.py b/collate/_locale.py

index d20e184..f297f6b 100644 (file)
--- a/collate/_locale.py
+++ b/collate/_locale.py
@@ -46,21 +46,21 @@ def localelist(*locales):
      added = set()
      retlist = []
  
      added = set()
      retlist = []
  
-    for code in locales:
-        if not code:
+    for locale_ in locales:
+        if not locale_:
              continue
          if locale is not None:
              continue
          if locale is not None:
-            code = locale.normalize(code)
+            locale_ = locale.normalize(locale_)
          # Strip off encoding if present.
          # Strip off encoding if present.
-        code = code.split(".")[0]
-        if code.lower() not in added:
-            retlist.append(code)
-            added.add(code.lower())
+        locale_ = locale_.split(".")[0]
+        if locale_.lower() not in added:
+            retlist.append(locale_)
+            added.add(locale_.lower())
          # Strip off territory if present.
          # Strip off territory if present.
-        code = code.split("_")[0]
-        if code.lower() not in added:
-            retlist.append(code)
-            added.add(code.lower())
+        locale_ = locale_.split("_")[0]
+        if locale_.lower() not in added:
+            retlist.append(locale_)
+            added.add(locale_.lower())
  
      return retlist
  
  
      return retlist
  
diff --git a/collate/errors.py b/collate/errors.py

index a425087..13a1e12 100644 (file)
--- a/collate/errors.py
+++ b/collate/errors.py
@@ -8,4 +8,4 @@ class InvalidLocaleError(LookupError):
      """
      def __init__(self, locale, string=""):
          self.locale = locale
      """
      def __init__(self, locale, string=""):
          self.locale = locale
-        ValueError.__init__(self, string or locale)
+        LookupError.__init__(self, string or locale)
diff --git a/collate/strings.py b/collate/strings.py

index 5badc8c..8d6af99 100644 (file)
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -1,3 +1,7 @@
+"""String utility functions for collation."""
+
+__all__ = ["sortemes", "numeric", "normalize_number"]
+
  import unicodedata
  
  CONTINUE_ON = frozenset([
  import unicodedata
  
  CONTINUE_ON = frozenset([
@@ -13,6 +17,17 @@ UNKNOWN, LETTER, NUMBER = range(3)
  BREAKER = u"\u2029" # Paragraph break character
  INFINITY = float('inf')
  
  BREAKER = u"\u2029" # Paragraph break character
  INFINITY = float('inf')
  
+KEEP_IN_NUMBERS = u"'.,"
+ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
+
+def stripends(word):
+    """Strip punctuation and symbols from the ends of a string."""
+    while word and unicodedata.category(word[0])[0] in "PS":
+        word = word[1:]
+    while word and unicodedata.category(word[-1])[0] in "PS":
+        word = word[:-1]
+    return word
+
  def sortemes(string, key=lambda s: s):
      """Generate a list of sortemes for the string.
  
  def sortemes(string, key=lambda s: s):
      """Generate a list of sortemes for the string.
  
@@ -24,6 +39,7 @@ def sortemes(string, key=lambda s: s):
  
      There is no formal specification for sortemes; the goal of this
      function is to provide good output for Collator.sortemekey.
  
      There is no formal specification for sortemes; the goal of this
      function is to provide good output for Collator.sortemekey.
+
      """
  
      words = []
      """
  
      words = []
@@ -35,21 +51,16 @@ def sortemes(string, key=lambda s: s):
      categories = map(unicodedata.category, string)
      previous = UNKNOWN
  
      categories = map(unicodedata.category, string)
      previous = UNKNOWN
  
-    def stripends(word):
-        while word and unicodedata.category(word[0])[0] in "PS":
-            word = word[1:]
-        while word and unicodedata.category(word[-1])[0] in "PS":
-            word = word[:-1]
-        return word
-
      def aletters(letters):
      def aletters(letters):
+        """Add a group of letters to the word list."""
          words.append((INFINITY, stripends(letters)))
      def adigits(digits):
          words.append((INFINITY, stripends(letters)))
      def adigits(digits):
+        """Add a group of digits to the word list."""
          words.append((numeric(digits), u''))
  
      # TODO(jfw): This kind of evolved over time, there's probably a much
      # faster / more concise way to express it now.
          words.append((numeric(digits), u''))
  
      # TODO(jfw): This kind of evolved over time, there's probably a much
      # faster / more concise way to express it now.
-    for i, (c, category) in enumerate(zip(string, categories)):
+    for i, (uchar, category) in enumerate(zip(string, categories)):
  
          if letters and previous == LETTER and words:
              word = stripends(words.pop()[1].strip()) + BREAKER
  
          if letters and previous == LETTER and words:
              word = stripends(words.pop()[1].strip()) + BREAKER
@@ -59,7 +70,7 @@ def sortemes(string, key=lambda s: s):
          # Split at the first letter following a number or
          # non-continuing character.
          if category[0] == "L":
          # Split at the first letter following a number or
          # non-continuing character.
          if category[0] == "L":
-            letters.append(c)
+            letters.append(uchar)
              if digits:
                  adigits(u"".join(digits).strip())
                  digits = []
              if digits:
                  adigits(u"".join(digits).strip())
                  digits = []
@@ -68,14 +79,14 @@ def sortemes(string, key=lambda s: s):
          # Split at the first number following a non-number or
          # non-continuing character.
          elif category[0] == "N":
          # Split at the first number following a non-number or
          # non-continuing character.
          elif category[0] == "N":
-            digits.append(c)
+            digits.append(uchar)
              if letters:
                  aletters(u"".join(letters))
                  letters = []
                  previous = LETTER
  
          # Only certain punctuation allowed in numbers.
              if letters:
                  aletters(u"".join(letters))
                  letters = []
                  previous = LETTER
  
          # Only certain punctuation allowed in numbers.
-        elif digits and c not in "',._":
+        elif digits and uchar not in ALLOWED_IN_NUMBERS:
              adigits(u"".join(digits))
              digits = []
              previous = NUMBER
              adigits(u"".join(digits))
              digits = []
              previous = NUMBER
@@ -105,9 +116,9 @@ def sortemes(string, key=lambda s: s):
  
          else:
              if digits:
  
          else:
              if digits:
-                digits.append(c)
+                digits.append(uchar)
              elif letters:
              elif letters:
-                letters.append(c)
+                letters.append(uchar)
  
      if letters and previous == LETTER and words:
          word = stripends(words.pop()[1].strip()) + BREAKER
  
      if letters and previous == LETTER and words:
          word = stripends(words.pop()[1].strip()) + BREAKER
@@ -122,12 +133,20 @@ def sortemes(string, key=lambda s: s):
      return [(i, key(w) if w else u'') for i, w in words]
  
  def numeric(orig, invalid=INFINITY):
      return [(i, key(w) if w else u'') for i, w in words]
  
  def numeric(orig, invalid=INFINITY):
+    """Parse a number out of a string.
+
+    This function parses a unicode number out of the start of a
+    string. If a number cannot be found at the start, the 'invalid'
+    argument is returned.
+        
+    """
+
      if not orig:
          return invalid
  
      string = unicode(orig)
      if not orig:
          return invalid
  
      string = unicode(orig)
-    for c in string:
-        if c.isnumeric():
+    for uchar in string:
+        if uchar.isnumeric():
              break
      else:
          return invalid
              break
      else:
          return invalid
@@ -139,18 +158,18 @@ def numeric(orig, invalid=INFINITY):
          string = string[1:]
  
      if not string[:1].isnumeric():
          string = string[1:]
  
      if not string[:1].isnumeric():
-        return (invalid, orig)
+        return invalid
  
  
-    string = normalize_punc(string)
+    string = normalize_number(string)
  
  
-    # Otherwise we need to do this the hard way.
      def _numeric(string):
      def _numeric(string):
+        """Interpret a number as base 10."""
          total = 0
          total = 0
-        for c in string:
-            v = unicodedata.numeric(c)
-            if v >= 1 or v == 0:
+        for uchar in string:
+            number = unicodedata.numeric(uchar)
+            if number >= 1 or number == 0:
                  total *= 10
                  total *= 10
-            total += v
+            total += number
          return total
  
      try:
          return total
  
      try:
@@ -161,9 +180,21 @@ def numeric(orig, invalid=INFINITY):
      except ValueError:
          return mult * _numeric(string)
  
      except ValueError:
          return mult * _numeric(string)
  
-def normalize_punc(string):
-    string = unicode(string.strip(u",.'"))
-    string = filter(lambda u: u.isnumeric() or u in u",.'", string)
+def normalize_number(string):
+    """Normalize punctuation in a number.
+
+    This function attempts to guess which characters in a number
+    represent grouping separators and which represent decimal
+    points. It returns a string that is valid to pass to Python's
+    float() routine (or the string 'NaN' if nothing like a number
+    is found).
+
+    """
+
+    string = unicode(string)
+    string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string)
+    string = string.strip(KEEP_IN_NUMBERS)
+
      commas = string.count(u",")
      stops = string.count(u".")
      quotes = string.count(u"'")
      commas = string.count(u",")
      stops = string.count(u".")
      quotes = string.count(u"'")
@@ -180,7 +211,7 @@ def normalize_punc(string):
          quotes = 0
  
      def normalize_two(a, b, string):
          quotes = 0
  
      def normalize_two(a, b, string):
-        # One of each - assume the first is grouping, second is point.
+        """One of each - assume the first is grouping, second is point."""
          a_idx = string.rindex(a)
          b_idx = string.rindex(b)
          if a_idx > b_idx:
          a_idx = string.rindex(a)
          b_idx = string.rindex(b)
          if a_idx > b_idx:
@@ -241,4 +272,4 @@ def normalize_punc(string):
          # Single stop, but no decimal - probably grouping.
          string = string.replace(u".", u"")
  
          # Single stop, but no decimal - probably grouping.
          string = string.replace(u".", u"")
  
-    return string
+    return string or "NaN"
diff --git a/setup.py b/setup.py

index 111cfac..99a13af 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@ else:
      libraries = ['icui18n', 'icuuc', 'icudata']
  
  setup(name='collate',
      libraries = ['icui18n', 'icuuc', 'icudata']
  
  setup(name='collate',
-      version='0',
+      version='0.1',
        author="Joe Wreschnig",
        author_email="joe.wreschnig@gmail.com",
        description="Python text collation",
        author="Joe Wreschnig",
        author_email="joe.wreschnig@gmail.com",
        description="Python text collation",
@@ -22,4 +22,14 @@ setup(name='collate',
                      libraries=libraries)],
        cmdclass=dict(build_ext=build_ext),
        packages=["collate", "collate.icu"],
                      libraries=libraries)],
        cmdclass=dict(build_ext=build_ext),
        packages=["collate", "collate.icu"],
+      long_description="""\
+This module provides tools to sort strings in a 'human-expected'
+order. Because human expectations are fuzzy and often
+self-contradictory, the sort order is not guaranteed to be stable
+between versions of this module (rather the opposite - the primary
+reason to update it will probably be changed sort results). If
+available, this module uses the ICU localization library. Otherwise,
+it uses the system's locale database (and produces significantly worse
+results).
+"""
        )
        )
author	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 23 Feb 2010 04:01:12 +0000 (20:01 -0800)
committer	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 23 Feb 2010 04:01:12 +0000 (20:01 -0800)
README.txt		patch \| blob \| history
collate/__init__.py		patch \| blob \| history
collate/_locale.py		patch \| blob \| history
collate/errors.py		patch \| blob \| history
collate/strings.py		patch \| blob \| history
setup.py		patch \| blob \| history