Collator.lstripwords: Strip words off the start and append to the end.

author Joe Wreschnig <joe.wreschnig@gmail.com>

Fri, 26 Feb 2010 04:49:40 +0000 (20:49 -0800)

committer Joe Wreschnig <joe.wreschnig@gmail.com>

Fri, 26 Feb 2010 04:49:40 +0000 (20:49 -0800)
author Joe Wreschnig <joe.wreschnig@gmail.com>
Fri, 26 Feb 2010 04:49:40 +0000 (20:49 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Fri, 26 Feb 2010 04:49:40 +0000 (20:49 -0800)
diff --git a/NEWS.txt b/NEWS.txt

index 823498d..c2cbbc0 100644 (file)
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,6 +1,8 @@
  2010 - 0.2 - Dedicated to ２ (U+FF12)
   - 直感～ before 直感２～.
   - Detect Unicode Roman numeral codepoints and sort them numerically.
+ - lstripwords and lstripsortemekey methods, e.g. sort "A Perfect Sky"
+   as "Perfect Sky, A".
  
  2010.02.22 - 0.1 - Dedicated to 勘 (U+52D8)
   - Initial release.
diff --git a/collate/__init__.py b/collate/__init__.py

index 5094071..ea1516a 100644 (file)
--- a/collate/__init__.py
+++ b/collate/__init__.py
@@ -52,7 +52,7 @@ try:
  except ImportError:
      pass
  
-VERSION = (0, 1)
+VERSION = (0, 2)
  VERSION_STRING = ".".join(map(str, VERSION))
  
  collator = None
diff --git a/collate/_abcollator.py b/collate/_abcollator.py

index 622766d..dd7ea14 100644 (file)
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -1,5 +1,7 @@
  """Abstract base collator."""
  
+import re
+
  import collate.strings
  
  class Collator(object):
@@ -27,15 +29,11 @@ class Collator(object):
          only meaningful when compared to other sort keys from the same
          collator.
          """
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-        return string
+        return self.unicode(string)
  
-    def words(self, string):
-        """Split the string along word boundries."""
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-        return string.split()
+    def words(self, string, sep=re.compile(r"(\s+)", re.UNICODE)):
+        """Split the string into separate words."""
+        return re.split(sep, self.unicode(string))
  
      def sortemekey(self, string):
          """Return a key based on sortemes of a string.
@@ -47,11 +45,49 @@ class Collator(object):
          numbers if 'too much' punctuation exists in between, between
          lines.
          """
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-
+        string = self.unicode(string)
          # Shove the sortkeyed original string on the end to resolve
          # ties intelligently.
          return (collate.strings.sortemes(string, self.key),
                  self.key(string))
  
+    def unicode(self, string):
+        """Convert a str to a unicode using the collator encoding."""
+        try:
+            return unicode(string)
+        except UnicodeError:
+            return string.decode(self.encoding, 'replace')
+
+    def str(self, string):
+        """Convert a unicode to a str using the collator encoding."""
+        try:
+            return str(string)
+        except UnicodeError:
+            return string.encode(self.encoding, 'replace')
+
+    def lstripwords(
+        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+        """Strip words and whitespace from the start of a string.
+
+        If append is not empty, it and the words stripped from the
+        front are appended to the end.
+        """
+        string = self.unicode(string)
+        stripped = []
+        words = self.words(string)
+        while words and (words[0].isspace() or words[0].lower() in strip):
+            stripped.append(words.pop(0))
+        while stripped and stripped[-1].isspace():
+            stripped.pop()
+        if append and stripped:
+            if words:
+                words.append(append)
+            words.extend(stripped)
+        return u"".join(words)
+
+    def lstripsortemekey(
+        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
+        """Return a key based on sortemes of a prefix-stripped string."""
+        string = self.unicode(string)
+        stripped = self.lstripwords(string, strip, append)
+        return (self.sortemekey(stripped), self.key(string))
diff --git a/collate/icu/__init__.py b/collate/icu/__init__.py

index e7dfe2d..6d27a6f 100644 (file)
--- a/collate/icu/__init__.py
+++ b/collate/icu/__init__.py
@@ -41,10 +41,8 @@ class Collator(collate._abcollator.Collator):
  
      def words(self, string):
          """Split the string along word boundries."""
-        if isinstance(string, str):
-            string = string.decode(self.encoding)
-        words = self._breaker.words(string)
-        return [w for w in words if not w.isspace()]
+        string = self.unicode(string)
+        return self._breaker.words(string)
  
      def key(self, string):
          """Sort key for a string.
@@ -53,7 +51,6 @@ class Collator(collate._abcollator.Collator):
          instance according to the 'encoding' attribute of the
          Collator.
          """
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
+        string = self.unicode(string)
          return self._collator.key(string)
  
diff --git a/collate/strings.py b/collate/strings.py

index 487257f..16ba8be 100644 (file)
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -40,6 +40,8 @@ ROMAN = {
      u"\u2188": 100000,
      }
  
+INITIAL_STOPS = frozenset([u"a", u"an", u"the"])
+
  def stripends(word):
      """Strip punctuation and symbols from the ends of a string."""
      while word and unicodedata.category(word[0])[0] in "PS":
diff --git a/collate/syslocale.py b/collate/syslocale.py

index 5b8adca..e48ee82 100644 (file)
--- a/collate/syslocale.py
+++ b/collate/syslocale.py
@@ -25,7 +25,6 @@ Avoid this backend if...
  __all__ = ["Collator"]
  
  import locale
-import re
  
  import collate.errors
  import collate._abcollator
@@ -54,12 +53,5 @@ class Collator(collate._abcollator.Collator):
          """
          try:
              return locale.strxfrm(string)
-        except UnicodeEncodeError:
-            return locale.strxfrm(string.encode(self.encoding, "replace"))
-
-    def words(self, string, sep=re.compile(r"\W+", re.UNICODE)):
-        """Split the string into separate words."""
-        if isinstance(string, str):
-            string = string.decode(self.encoding, 'replace')
-        return re.split(sep, string)
-
+        except UnicodeError:
+            return locale.strxfrm(string.str(self.encoding))
diff --git a/setup.py b/setup.py

index 99a13af..e47d620 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@ else:
      libraries = ['icui18n', 'icuuc', 'icudata']
  
  setup(name='collate',
-      version='0.1',
+      version='0.2',
        author="Joe Wreschnig",
        author_email="joe.wreschnig@gmail.com",
        description="Python text collation",
author	Joe Wreschnig <joe.wreschnig@gmail.com>
	Fri, 26 Feb 2010 04:49:40 +0000 (20:49 -0800)
committer	Joe Wreschnig <joe.wreschnig@gmail.com>
	Fri, 26 Feb 2010 04:49:40 +0000 (20:49 -0800)
NEWS.txt		patch \| blob \| history
collate/__init__.py		patch \| blob \| history
collate/_abcollator.py		patch \| blob \| history
collate/icu/__init__.py		patch \| blob \| history
collate/strings.py		patch \| blob \| history
collate/syslocale.py		patch \| blob \| history
setup.py		patch \| blob \| history