strings: Include deroman in import list.

[python-collate.git] / collate / strings.py
diff --git a/collate/strings.py b/collate/strings.py

index 8d6af99..e2750d1 100644 (file)
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -1,6 +1,6 @@
  """String utility functions for collation."""
  
-__all__ = ["sortemes", "numeric", "normalize_number"]
+__all__ = ["sortemes", "numeric", "normalize_number", "deroman"]
  
  import unicodedata
  
@@ -14,12 +14,32 @@ CONTINUE_ON = frozenset([
  
  UNKNOWN, LETTER, NUMBER = range(3)
  
-BREAKER = u"\u2029" # Paragraph break character
+BREAKER = u"\u2028" # Line break character (U+2028 LINE SEPARATOR)
+HBREAKER = u"\u2029" # Paragraph break character (U+2029 PARAGRAPH SEPARATOR); sortemes() appends it to a pending letter run that ends in a letter when a numeric character follows
  INFINITY = float('inf')
  
  KEEP_IN_NUMBERS = u"'.,"
  ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
  
+ROMAN = {  # Integer value of each numeral character; deroman() lowercases its input before lookup, hence lowercase keys
+    u"i": 1,
+    u"v": 5,
+    u"x": 10,
+    u"l": 50,
+    u"c": 100,
+    u"d": 500,
+    u"m": 1000,
+    u"\u2180": 1000,  # U+2180 ROMAN NUMERAL ONE THOUSAND C D
+    u"\u2181": 5000,  # U+2181 ROMAN NUMERAL FIVE THOUSAND
+    u"\u2182": 10000,  # U+2182 ROMAN NUMERAL TEN THOUSAND
+    u"\u2183": 100,  # U+2183 ROMAN NUMERAL REVERSED ONE HUNDRED
+    u"\u2184": 100,  # U+2184 LATIN SMALL LETTER REVERSED C, the lowercase counterpart of U+2183
+    u"\u2185": 6,  # U+2185 ROMAN NUMERAL SIX LATE FORM
+    u"\u2186": 50,  # U+2186 ROMAN NUMERAL FIFTY EARLY FORM
+    u"\u2187": 50000,  # U+2187 ROMAN NUMERAL FIFTY THOUSAND
+    u"\u2188": 100000,  # U+2188 ROMAN NUMERAL ONE HUNDRED THOUSAND
+    }
+
  def stripends(word):
      """Strip punctuation and symbols from the ends of a string."""
      while word and unicodedata.category(word[0])[0] in "PS":
@@ -81,6 +101,8 @@ def sortemes(string, key=lambda s: s):
          elif category[0] == "N":
              digits.append(uchar)
              if letters:
+                if unicodedata.category(letters[-1])[0] == "L":
+                    letters.append(HBREAKER)
                  aletters(u"".join(letters))
                  letters = []
                  previous = LETTER
@@ -151,6 +173,10 @@ def numeric(orig, invalid=INFINITY):
      else:
          return invalid
  
+    for char in string:
+        if u"\u2160" <= char <= u"\u2188":
+            return deroman(string)
+
      mult = 1
      while string[:1] == u"-" or string[:1] == u"+":
          if string[:1] == u"-":
@@ -273,3 +299,20 @@ def normalize_number(string):
          string = string.replace(u".", u"")
  
      return string or "NaN"
+
+def deroman(string):
+    """Return the integer value of a Roman numeral, skipping other characters."""
+    string = unicodedata.normalize('NFKD', unicode(string)).lower()  # NFKD expands compatibility numerals (e.g. U+2167 -> "VIII"); lowercase to match the ROMAN keys
+    previous = 0  # value of the numeral character handled in the prior iteration (its right-hand neighbour)
+    building = 0  # running total; stays 0 if no character is found in ROMAN
+    for char in reversed(string):  # scan from the last character to the first
+        try:
+            value = ROMAN[char]
+        except KeyError:
+            continue  # not a numeral character: ignore it and leave `previous` untouched
+        if value < previous:
+            building -= value  # smaller than its right-hand neighbour, so subtractive (the "i" in "iv")
+        else:
+            building += value
+        previous = value
+    return building