Different algorithm, gives better results for numbers following grouping numbers...

author Joe Wreschnig <joe.wreschnig@gmail.com>

Mon, 22 Feb 2010 04:28:46 +0000 (20:28 -0800)

committer Joe Wreschnig <joe.wreschnig@gmail.com>

Mon, 22 Feb 2010 04:28:46 +0000 (20:28 -0800)
author Joe Wreschnig <joe.wreschnig@gmail.com>
Mon, 22 Feb 2010 04:28:46 +0000 (20:28 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Mon, 22 Feb 2010 04:28:46 +0000 (20:28 -0800)
diff --git a/collate/strings.py b/collate/strings.py

index 267c6e5..5717246 100644 (file)
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -10,7 +10,7 @@ CONTINUE_ON = frozenset([
  
  UNKNOWN, LETTER, NUMBER = range(3)
  
  
  UNKNOWN, LETTER, NUMBER = range(3)
  
-BREAKER = u"\u2029"
+BREAKER = u"\u2029" # Paragraph break character
  
  def sortemes(string):
      """Generate a list of sortemes for the string.
  
  def sortemes(string):
      """Generate a list of sortemes for the string.
@@ -26,77 +26,97 @@ def sortemes(string):
      """
  
      words = []
      """
  
      words = []
+    letters = []
+    digits = []
      if not string:
          return words
      string = unicode(string)
      if not string:
          return words
      string = unicode(string)
-    start = None
-    last = None
-    mode = UNKNOWN
-    previous_mode = UNKNOWN
-    category = "XX"
+    categories = map(unicodedata.category, string)
+    previous = UNKNOWN
+    types = []
+
+    def stripends(word):
+        while word and unicodedata.category(word[0])[0] in "PS":
+            word = word[1:]
+        while word and unicodedata.category(word[-1])[0] in "PS":
+            word = word[:-1]
+        return word
  
      # TODO(jfw): This kind of evolved over time, there's probably a much
      # faster / more concise way to express it now.
  
      # TODO(jfw): This kind of evolved over time, there's probably a much
      # faster / more concise way to express it now.
-    for i, c in enumerate(string):
-        broke = False
-        prev_category = category
-        this_mode = mode
-        category = unicodedata.category(c)
+    for i, (c, category) in enumerate(zip(string, categories)):
+
+        if letters and previous == LETTER and words:
+            word = stripends(words.pop().strip())
+            letters = list(stripends(word).strip() + BREAKER) + letters
+            previous = UNKNOWN
  
          # Split at the first letter following a number or
          # non-continuing character.
          if category[0] == "L":
  
          # Split at the first letter following a number or
          # non-continuing character.
          if category[0] == "L":
-            if mode != LETTER:
-                broke = True
-                mode = LETTER
+            letters.append(c)
+            if digits:
+                words.append(u"".join(digits).strip())
+                previous = NUMBER
+                digits = []
  
          # Split at the first number following a non-number or
          # non-continuing character.
          elif category[0] == "N":
  
          # Split at the first number following a non-number or
          # non-continuing character.
          elif category[0] == "N":
-            if mode != NUMBER:
-                broke = True
-                mode = NUMBER
-
-        # Split if we find a non-continuing character ("weird" ones).
-        elif category not in CONTINUE_ON:
-            broke = True
-            mode = UNKNOWN
+            digits.append(c)
+            if letters:
+                words.append(u"".join(letters))
+                previous = LETTER
+                letters = []
  
          # Only certain punctuation allowed in numbers.
  
          # Only certain punctuation allowed in numbers.
-        elif mode == NUMBER and category[0] == "P" and c not in "',._":
-            broke = True
-            mode = UNKNOWN
+        elif digits and c not in "',._":
+            words.append(u"".join(digits))
+            previous = NUMBER
+            digits = []
+
+        # Split if we find a non-continuing character ("weird" ones).
+        elif letters and category not in CONTINUE_ON:
+            if letters:
+                words.append(u"".join(letters).strip() + BREAKER)
+                previous = LETTER
+                letters = []
+            if digits:
+                words.append(u"".join(digits).strip() + BREAKER)
+                previous = NUMBER
+                digits = []
  
          # Split if we find two pieces of punctuation in a row, even
          # if we should otherwise continue.
  
          # Split if we find two pieces of punctuation in a row, even
          # if we should otherwise continue.
-        elif prev_category[0] in "P" and category[0] in "P":
-            broke = True
-            mode = UNKNOWN
-
-        if broke and start is not None and last is not None:
-            # If we read two strings separated by weird punctuation,
-            # pretend the punctuation isn't there.
-            if this_mode == previous_mode == LETTER:
-                words[-1] += BREAKER + string[start:last+1]
-            else:
-                if this_mode == NUMBER and previous_mode == LETTER:
-                    words[-1] += BREAKER
-                words.append(string[start:last+1])
-            previous_mode = this_mode
-
-        if broke:
-            start = i
-            last = None
-        if category[0] in "LN":
-            last = i
-    this_mode = mode
-    if start is not None and last is not None:
-        if this_mode == LETTER and previous_mode == LETTER and words:
-            words[-1] += BREAKER + string[start:last+1]
+        elif i and categories[i-1][0] in "P" and category[0] in "P":
+            if letters:
+                words.append(u"".join(letters))
+                previous = LETTER
+                letters = []
+            if digits:
+                words.append(u"".join(digits))
+                previous = NUMBER
+                digits = []
+
          else:
          else:
-            if this_mode == NUMBER and previous_mode == LETTER and words:
-                words[-1] += BREAKER
-            words.append(string[start:last+1])
+            if digits:
+                digits.append(c)
+            elif letters:
+                letters.append(c)
+
+    if letters and previous == LETTER and words:
+        word = stripends(words.pop().strip())
+        letters = list(stripends(word).strip() + BREAKER) + letters
+        previous = UNKNOWN
+
+    if letters:
+        words.append(u"".join(letters))
+        letters = []
+    if digits:
+        words.append(u"".join(digits))
+        digits = []
+
+    words = map(stripends, words)
      return words
  
  def numeric(orig, invalid=float('inf')):
      return words
  
  def numeric(orig, invalid=float('inf')):
diff --git a/tests/en/numbersafternothing.list.txt b/tests/en/numbersafternothing.list.txt

new file mode 100644 (file)

index 0000000..fc38492
--- /dev/null
+++ b/tests/en/numbersafternothing.list.txt
@@ -0,0 +1,6 @@
+Promised land
+Promised Land
+Promised Land (Loren & Mash Studio Live)
+Promised Land (Reprise)
+Promised land 2005
+Promised land 2005 (TETSU P'UNK vocalless version)
author	Joe Wreschnig <joe.wreschnig@gmail.com>
	Mon, 22 Feb 2010 04:28:46 +0000 (20:28 -0800)
committer	Joe Wreschnig <joe.wreschnig@gmail.com>
	Mon, 22 Feb 2010 04:28:46 +0000 (20:28 -0800)
collate/strings.py		patch \| blob \| history
tests/en/numbersafternothing.list.txt	[new file with mode: 0644]	patch \| blob