Roman numeral parsing. More test cases. (Fixes issue #3)

author Joe Wreschnig <joe.wreschnig@gmail.com>

Thu, 25 Feb 2010 05:51:56 +0000 (21:51 -0800)

committer Joe Wreschnig <joe.wreschnig@gmail.com>

Thu, 25 Feb 2010 05:51:56 +0000 (21:51 -0800)
author Joe Wreschnig <joe.wreschnig@gmail.com>
Thu, 25 Feb 2010 05:51:56 +0000 (21:51 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Thu, 25 Feb 2010 05:51:56 +0000 (21:51 -0800)
diff --git a/NEWS.txt b/NEWS.txt

index 8bb6e70dcef0f1689f57251891e85aa6d1f526c5..823498d392f526f645441ee9546dbd94261ed37f 100644 (file)
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,6 @@
  2010 - 0.2 - Dedicated to ２ (U+FF12)
   - 直感～ before 直感２～.
+ - Detect Unicode Roman numeral codepoints and sort them numerically.
  
  2010.02.22 - 0.1 - Dedicated to 勘 (U+52D8)
   - Initial release.
diff --git a/collate/strings.py b/collate/strings.py

index 60685c57a3415e763a63d05545340429113ec222..fd6f71d852fe4c86f9260eb9a5d83b073dbe7e8d 100644 (file)
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -21,6 +21,25 @@ INFINITY = float('inf')
  KEEP_IN_NUMBERS = u"'.,"
  ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
  
+ROMAN = {
+    u"i": 1,
+    u"v": 5,
+    u"x": 10,
+    u"l": 50,
+    u"c": 100,
+    u"d": 500,
+    u"m": 1000,
+    u"\u2180": 1000,
+    u"\u2181": 5000,
+    u"\u2182": 10000,
+    u"\u2183": 100,
+    u"\u2184": 100,
+    u"\u2185": 6,
+    u"\u2186": 50,
+    u"\u2187": 50000,
+    u"\u2188": 100000,
+    }
+
  def stripends(word):
      """Strip punctuation and symbols from the ends of a string."""
      while word and unicodedata.category(word[0])[0] in "PS":
@@ -154,6 +173,10 @@ def numeric(orig, invalid=INFINITY):
      else:
          return invalid
  
+    for char in string:
+        if u"\u2160" <= char <= u"\u2188":
+            return deroman(string)
+
      mult = 1
      while string[:1] == u"-" or string[:1] == u"+":
          if string[:1] == u"-":
@@ -276,3 +299,20 @@ def normalize_number(string):
          string = string.replace(u".", u"")
  
      return string or "NaN"
+
+def deroman(string):
+    """Turn a Roman numeral into an integer."""
+    string = unicodedata.normalize('NFKD', unicode(string)).lower()
+    previous = 0
+    building = 0
+    for char in reversed(string):
+        try:
+            value = ROMAN[char]
+        except KeyError:
+            continue
+        if value < previous:
+            building -= value
+        else:
+            building += value
+        previous = value
+    return building
diff --git a/tests/en/kanjinum.list.txt b/tests/en/kanjinum.list.txt

new file mode 100644 (file)

index 0000000..be3f783
--- /dev/null
+++ b/tests/en/kanjinum.list.txt
@@ -0,0 +1,4 @@
+白玉楼 ～ 幽冥の住人は割と少ない
+直感～時として恋は～ (live version)
+直感２～逃した魚は大きいぞ！～
+真っ赤なバラとジントニック
diff --git a/tests/en/romannumeral.list.txt b/tests/en/romannumeral.list.txt

new file mode 100644 (file)

index 0000000..00db739
--- /dev/null
+++ b/tests/en/romannumeral.list.txt
@@ -0,0 +1,10 @@
+ⅰ
+ⅱ
+ⅲ
+ⅳ
+ⅴ
+ⅵ
+ⅴⅰⅰ
+ⅹⅹ
+ⅿⅰⅹ
+ⅿⅹ
author	Joe Wreschnig <joe.wreschnig@gmail.com>
	Thu, 25 Feb 2010 05:51:56 +0000 (21:51 -0800)
committer	Joe Wreschnig <joe.wreschnig@gmail.com>
	Thu, 25 Feb 2010 05:51:56 +0000 (21:51 -0800)
NEWS.txt		patch | blob | history
collate/strings.py		patch | blob | history
tests/en/kanjinum.list.txt	[new file with mode: 0644]	patch | blob
tests/en/romannumeral.list.txt	[new file with mode: 0644]	patch | blob