"""String utility functions for collation."""

# Reconstructed from the post-image of a whitespace-collapsed patch to
# collate/strings.py.  Only what that patch shows in full is reproduced
# here: the module-level constants and deroman().  The same patch also
# touches stripends(), sortemes(), numeric() and normalize_number(), but
# shows only fragments of them, so they are not reproduced.  For the
# record, the visible fragments do the following:
#   * "deroman" is added to __all__;
#   * sortemes(), on meeting a digit while letters are pending, appends
#     HBREAKER to the pending letters when the last one is in a Unicode
#     "L" category, before flushing them;
#   * numeric() returns deroman(string) as soon as it finds a character
#     in the range U+2160..U+2188.

import unicodedata

UNKNOWN, LETTER, NUMBER = range(3)

BREAKER = u"\u2028"   # Line break character (U+2028 LINE SEPARATOR)
HBREAKER = u"\u2029"  # Paragraph break character (U+2029 PARAGRAPH SEPARATOR)
INFINITY = float('inf')

KEEP_IN_NUMBERS = u"'.,"
ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"

# Value of each Roman numeral character that deroman() understands.
# Keys are lowercase because deroman() lowercases its input; the
# U+2180..U+2188 entries are matched directly, as they do not reduce
# to ASCII letters.
ROMAN = {
    u"i": 1,
    u"v": 5,
    u"x": 10,
    u"l": 50,
    u"c": 100,
    u"d": 500,
    u"m": 1000,
    u"\u2180": 1000,
    u"\u2181": 5000,
    u"\u2182": 10000,
    u"\u2183": 100,
    u"\u2184": 100,
    u"\u2185": 6,
    u"\u2186": 50,
    u"\u2187": 50000,
    u"\u2188": 100000,
    }


def deroman(string):
    """Turn a Roman numeral into an integer.

    The input is converted to text, NFKD-normalized and lowercased, so
    the precomposed numerals U+2160..U+217F are reduced to plain ASCII
    letters before lookup. Characters that have no entry in ROMAN are
    skipped, so a string with no numeral characters at all yields 0.

    The string is scanned right to left: a numeral smaller than the
    numeral immediately to its right is subtracted (the "i" in "ix"),
    anything else is added.

    Arguments:
    string -- the numeral; anything the text type can convert

    Returns the integer value.
    """
    # `unicode` only exists on Python 2; fall back to `str` so the
    # function does not die with a NameError on Python 3.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    string = unicodedata.normalize('NFKD', text_type(string)).lower()
    previous = 0
    building = 0
    for char in reversed(string):
        value = ROMAN.get(char)
        if value is None:
            # Not a numeral character; ignore it entirely (it does not
            # reset `previous` either).
            continue
        if value < previous:
            building -= value
        else:
            building += value
        previous = value
    return building