Category-based splitting.

author Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 16 Feb 2010 09:28:22 +0000 (01:28 -0800)

committer Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 16 Feb 2010 09:28:22 +0000 (01:28 -0800)
author Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 16 Feb 2010 09:28:22 +0000 (01:28 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 16 Feb 2010 09:28:22 +0000 (01:28 -0800)
diff --git a/collate/_abcollator.py b/collate/_abcollator.py

index 02cb733c79d889462a03e4680ece539e664cb696..2e6ab9567338e48884c6e46778b112e914d229e5 100644 (file)
--- a/collate/_abcollator.py
+++ b/collate/_abcollator.py
@@ -19,5 +19,6 @@ class Collator(object):
          words = []
          for sorteme in self.sortemes(string):
              num, alpha = collate._strings.numeric(sorteme, invalid)
-            words.append((num, self.key(alpha)))
+            alpha = self.key(collate._strings.strip_punc(alpha))
+            words.append((num, alpha))
          return words
diff --git a/collate/_strings.py b/collate/_strings.py

index f81bfd7e20be5bc65529f85e3bccab5fb0f15d60..d872ed4a1bca483944cf6b3a4a86b870562dd314 100644 (file)
--- a/collate/_strings.py
+++ b/collate/_strings.py
@@ -1,9 +1,12 @@
  import unicodedata
  
-def strip_nonalnum(string):
-    while string and not (string[0].isalpha() or string[0].isnumeric()):
+def strip_punc(string):
+    return filter(lambda c: unicodedata.category(c)[0] not in "PS", string)
+
+def strip_ends(string):
+    while string and unicodedata.category(string[0])[0] in "ZPS":
          string = string[1:]
-    while string and not (string[-1].isalpha() or string[-1].isnumeric()):
+    while string and unicodedata.category(string[-1])[0] in "ZPS":
          string = string[:-1]
      return string
  
@@ -15,6 +18,7 @@ def alnumsplit(string):
      numeric = None
      start = 0
      for i, char in enumerate(string):
+        category = unicodedata.category(char)
          if numeric is None:
              broke = False
              if char.isnumeric():
@@ -24,17 +28,17 @@ def alnumsplit(string):
          elif numeric and char.isalpha():
              broke = True
              numeric = False
-        elif numeric and char.isspace():
+        elif numeric and category in ["Zs", "Ps", "Pe"]:
              broke = True
              numeric = None
          elif not numeric and char.isnumeric():
              broke = True
              numeric = True
          if broke:
-            strings.append(strip_nonalnum(string[start:i]))
+            strings.append(strip_ends(string[start:i]))
              start = i
              broke = False
-    strings.append(strip_nonalnum(string[start:i + 1]))
+    strings.append(strip_ends(string[start:i + 1]))
      return strings
  
  def wordlike(string):
@@ -85,7 +89,7 @@ def numeric(orig, invalid=float('inf')):
          total = 0
          for c in string:
              v = unicodedata.numeric(c)
-            if v >= 1:
+            if v >= 1 or v == 0:
                  total *= 10
              total += v
          return total
author	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 16 Feb 2010 09:28:22 +0000 (01:28 -0800)
committer	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 16 Feb 2010 09:28:22 +0000 (01:28 -0800)
collate/_abcollator.py		patch | blob | history
collate/_strings.py		patch | blob | history