Cleanup in preparation for release. Add docstrings, remove basically empty _constants...

[python-collate.git] / collate / _abcollator.py
diff --git a/collate/_abcollator.py b/collate/_abcollator.py

index 9dce32b..622766d 100644 (file)
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -1,22 +1,51 @@
+"""Abstract base collator."""
+
  import collate.strings
  
  class Collator(object):
+    """Abstract base class for Collators.
+
+    Attributes:
+    locale - the collator follows rules for this locale
+    encoding - assumed string encoding
+    """
+
+    locale = "C"
+    encoding = "ascii"
+
+    def __init__(self, locale=None, encoding=None):
+        pass
+
      def cmp(self, string1, string2):
          """Return negative if a < b, zero if a == b, positive if a > b."""
          return cmp(self.key(string1), self.key(string2))
  
+    def key(self, string):
+        """Return a good sorting key for the string.
+        
+        The sort key should be considered an opaque value which is
+        only meaningful when compared to other sort keys from the same
+        collator.
+        """
+        if isinstance(string, str):
+            string = string.decode(self.encoding, 'replace')
+        return string
+
      def words(self, string):
          """Split the string along word boundries."""
          if isinstance(string, str):
              string = string.decode(self.encoding, 'replace')
          return string.split()
  
-    def sortemekey(self, string, invalid=float('inf')):
+    def sortemekey(self, string):
          """Return a key based on sortemes of a string.
  
-        If the string is a str instance, it is decoded to a unicode
-        instance according to the 'encoding' attribute of the
-        Collator.
+        A sorteme, by analogy with grapheme/morpheme/etc., is an atom
+        of sort information. This is larger than a word boundary but
+        smaller than a sentence boundary; roughly, a sorteme boundary
+        occurs between letters and numbers, between numbers and
+        numbers if 'too much' punctuation exists in between, and
+        between lines.
          """
          if isinstance(string, str):
              string = string.decode(self.encoding, 'replace')