From: Joe Wreschnig
Date: Thu, 25 Feb 2010 05:51:56 +0000 (-0800)
Subject: Roman numeral parsing. More test cases. (Fixes issue #3)
X-Git-Url: https://git.korewanetadesu.com/?p=python-collate.git;a=commitdiff_plain;h=29aa14bcc723a78fb2f172488c65c4893f6797a3

Roman numeral parsing. More test cases. (Fixes issue #3)
---

diff --git a/NEWS.txt b/NEWS.txt
index 8bb6e70..823498d 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,6 @@
 2010 - 0.2 - Dedicated to ２ (U+FF12)
  - 直感~ before 直感2~.
+ - Detect Unicode Roman numeral codepoints and sort them numerically.
 
 2010.02.22 - 0.1 - Dedicated to 勘 (U+52D8)
  - Initial release.
diff --git a/collate/strings.py b/collate/strings.py
index 60685c5..fd6f71d 100644
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -21,6 +21,25 @@ INFINITY = float('inf')
 KEEP_IN_NUMBERS = u"'.,"
 ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
 
+ROMAN = {
+    u"i": 1,
+    u"v": 5,
+    u"x": 10,
+    u"l": 50,
+    u"c": 100,
+    u"d": 500,
+    u"m": 1000,
+    u"\u2180": 1000,
+    u"\u2181": 5000,
+    u"\u2182": 10000,
+    u"\u2183": 100,
+    u"\u2184": 100,
+    u"\u2185": 6,
+    u"\u2186": 50,
+    u"\u2187": 50000,
+    u"\u2188": 100000,
+    }
+
 def stripends(word):
     """Strip punctuation and symbols from the ends of a string."""
     while word and unicodedata.category(word[0])[0] in "PS":
@@ -154,6 +173,10 @@ def numeric(orig, invalid=INFINITY):
     else:
         return invalid
 
+    for char in string:
+        if u"\u2160" <= char <= u"\u2188":
+            return deroman(string)
+
     mult = 1
     while string[:1] == u"-" or string[:1] == u"+":
         if string[:1] == u"-":
@@ -276,3 +299,20 @@ def normalize_number(string):
         string = string.replace(u".", u"")
 
     return string or "NaN"
+
+def deroman(string):
+    """Turn a Roman numeral into an integer."""
+    string = unicodedata.normalize('NFKD', unicode(string)).lower()
+    previous = 0
+    building = 0
+    for char in reversed(string):
+        try:
+            value = ROMAN[char]
+        except KeyError:
+            continue
+        if value < previous:
+            building -= value
+        else:
+            building += value
+        previous = value
+    return building
diff --git a/tests/en/kanjinum.list.txt b/tests/en/kanjinum.list.txt
new file mode 100644
index 0000000..be3f783
--- /dev/null
+++ b/tests/en/kanjinum.list.txt
@@ -0,0 +1,4 @@
+白玉楼 ~ 幽冥の住人は割と少ない
+直感~時として恋は~ (live version)
+直感2~逃した魚は大きいぞ!~
+真っ赤なバラとジントニック
diff --git a/tests/en/romannumeral.list.txt b/tests/en/romannumeral.list.txt
new file mode 100644
index 0000000..00db739
--- /dev/null
+++ b/tests/en/romannumeral.list.txt
@@ -0,0 +1,10 @@
+ⅰ
+ⅱ
+ⅲ
+ⅳ
+ⅴ
+ⅵ
+ⅴⅰⅰ
+ⅹⅹ
+ⅿⅰⅹ
+ⅿⅹ