strings: Include deroman in import list.

[python-collate.git] / collate / strings.py
diff --git a/collate/strings.py b/collate/strings.py

index 5badc8c..e2750d1 100644 (file)
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -1,3 +1,7 @@
+"""String utility functions for collation."""
+
+__all__ = ["sortemes", "numeric", "normalize_number", "deroman"]
+
  import unicodedata
  
  CONTINUE_ON = frozenset([
@@ -10,9 +14,40 @@ CONTINUE_ON = frozenset([
  
  UNKNOWN, LETTER, NUMBER = range(3)
  
-BREAKER = u"\u2029" # Paragraph break character
+BREAKER = u"\u2028" # Line break character
+HBREAKER = u"\u2029" # Paragraph break character
  INFINITY = float('inf')
  
+KEEP_IN_NUMBERS = u"'.,"  # punctuation normalize_number() keeps when filtering, then strips from both ends
+ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"  # non-letter, non-number characters that do not end a digit run in sortemes()
+
+ROMAN = {  # numeral character -> integer value; looked up by deroman()
+    u"i": 1,
+    u"v": 5,
+    u"x": 10,
+    u"l": 50,
+    u"c": 100,
+    u"d": 500,
+    u"m": 1000,
+    u"\u2180": 1000,  # U+2180 ROMAN NUMERAL ONE THOUSAND C D
+    u"\u2181": 5000,  # U+2181 ROMAN NUMERAL FIVE THOUSAND
+    u"\u2182": 10000,  # U+2182 ROMAN NUMERAL TEN THOUSAND
+    u"\u2183": 100,  # U+2183 ROMAN NUMERAL REVERSED ONE HUNDRED
+    u"\u2184": 100,  # U+2184 LATIN SMALL LETTER REVERSED C
+    u"\u2185": 6,  # U+2185 ROMAN NUMERAL SIX LATE FORM
+    u"\u2186": 50,  # U+2186 ROMAN NUMERAL FIFTY EARLY FORM
+    u"\u2187": 50000,  # U+2187 ROMAN NUMERAL FIFTY THOUSAND
+    u"\u2188": 100000,  # U+2188 ROMAN NUMERAL ONE HUNDRED THOUSAND
+    }
+
+def stripends(word):
+    """Strip punctuation and symbols from the ends of a string."""
+    while word and unicodedata.category(word[0])[0] in "PS":  # first letter of the category: P* or S*
+        word = word[1:]  # drop one leading character per pass
+    while word and unicodedata.category(word[-1])[0] in "PS":  # same test on the last character
+        word = word[:-1]  # drop one trailing character per pass
+    return word
+
  def sortemes(string, key=lambda s: s):
      """Generate a list of sortemes for the string.
  
@@ -24,6 +59,7 @@ def sortemes(string, key=lambda s: s):
  
      There is no formal specification for sortemes; the goal of this
      function is to provide good output for Collator.sortemekey.
+
      """
  
      words = []
@@ -35,21 +71,16 @@ def sortemes(string, key=lambda s: s):
      categories = map(unicodedata.category, string)
      previous = UNKNOWN
  
-    def stripends(word):
-        while word and unicodedata.category(word[0])[0] in "PS":
-            word = word[1:]
-        while word and unicodedata.category(word[-1])[0] in "PS":
-            word = word[:-1]
-        return word
-
      def aletters(letters):
+        """Add a group of letters to the word list."""
          words.append((INFINITY, stripends(letters)))
      def adigits(digits):
+        """Add a group of digits to the word list."""
          words.append((numeric(digits), u''))
  
      # TODO(jfw): This kind of evolved over time, there's probably a much
      # faster / more concise way to express it now.
-    for i, (c, category) in enumerate(zip(string, categories)):
+    for i, (uchar, category) in enumerate(zip(string, categories)):
  
          if letters and previous == LETTER and words:
              word = stripends(words.pop()[1].strip()) + BREAKER
@@ -59,7 +90,7 @@ def sortemes(string, key=lambda s: s):
          # Split at the first letter following a number or
          # non-continuing character.
          if category[0] == "L":
-            letters.append(c)
+            letters.append(uchar)
              if digits:
                  adigits(u"".join(digits).strip())
                  digits = []
@@ -68,14 +99,16 @@ def sortemes(string, key=lambda s: s):
          # Split at the first number following a non-number or
          # non-continuing character.
          elif category[0] == "N":
-            digits.append(c)
+            digits.append(uchar)
              if letters:
+                if unicodedata.category(letters[-1])[0] == "L":
+                    letters.append(HBREAKER)
                  aletters(u"".join(letters))
                  letters = []
                  previous = LETTER
  
          # Only certain punctuation allowed in numbers.
-        elif digits and c not in "',._":
+        elif digits and uchar not in ALLOWED_IN_NUMBERS:
              adigits(u"".join(digits))
              digits = []
              previous = NUMBER
@@ -105,9 +138,9 @@ def sortemes(string, key=lambda s: s):
  
          else:
              if digits:
-                digits.append(c)
+                digits.append(uchar)
              elif letters:
-                letters.append(c)
+                letters.append(uchar)
  
      if letters and previous == LETTER and words:
          word = stripends(words.pop()[1].strip()) + BREAKER
@@ -122,16 +155,28 @@ def sortemes(string, key=lambda s: s):
      return [(i, key(w) if w else u'') for i, w in words]
  
  def numeric(orig, invalid=INFINITY):
+    """Parse a number out of a string.
+
+    This function parses a unicode number out of the start of a
+    string. If a number cannot be found at the start, the 'invalid'
+    argument is returned.
+        
+    """
+
      if not orig:
          return invalid
  
      string = unicode(orig)
-    for c in string:
-        if c.isnumeric():
+    for uchar in string:
+        if uchar.isnumeric():
              break
      else:
          return invalid
  
+    for char in string:
+        if u"\u2160" <= char <= u"\u2188":
+            return deroman(string)
+
      mult = 1
      while string[:1] == u"-" or string[:1] == u"+":
          if string[:1] == u"-":
@@ -139,18 +184,18 @@ def numeric(orig, invalid=INFINITY):
          string = string[1:]
  
      if not string[:1].isnumeric():
-        return (invalid, orig)
+        return invalid
  
-    string = normalize_punc(string)
+    string = normalize_number(string)
  
-    # Otherwise we need to do this the hard way.
      def _numeric(string):
+        """Interpret a number as base 10."""
          total = 0
-        for c in string:
-            v = unicodedata.numeric(c)
-            if v >= 1 or v == 0:
+        for uchar in string:
+            number = unicodedata.numeric(uchar)
+            if number >= 1 or number == 0:
                  total *= 10
-            total += v
+            total += number
          return total
  
      try:
@@ -161,9 +206,21 @@ def numeric(orig, invalid=INFINITY):
      except ValueError:
          return mult * _numeric(string)
  
-def normalize_punc(string):
-    string = unicode(string.strip(u",.'"))
-    string = filter(lambda u: u.isnumeric() or u in u",.'", string)
+def normalize_number(string):
+    """Normalize punctuation in a number.
+
+    This function attempts to guess which characters in a number
+    represent grouping separators and which represent decimal
+    points. It returns a string that is valid to pass to Python's
+    float() routine (possibly "NaN", if nothing like a number is
+    found).
+
+    """
+
+    string = unicode(string)
+    string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string)
+    string = string.strip(KEEP_IN_NUMBERS)
+
      commas = string.count(u",")
      stops = string.count(u".")
      quotes = string.count(u"'")
@@ -180,7 +237,7 @@ def normalize_punc(string):
          quotes = 0
  
      def normalize_two(a, b, string):
-        # One of each - assume the first is grouping, second is point.
+        """One of each - assume the first is grouping, second is point."""
          a_idx = string.rindex(a)
          b_idx = string.rindex(b)
          if a_idx > b_idx:
@@ -241,4 +298,21 @@ def normalize_punc(string):
          # Single stop, but no decimal - probably grouping.
          string = string.replace(u".", u"")
  
-    return string
+    return string or "NaN"
+
+def deroman(string):
+    """Turn a Roman numeral into an integer."""
+    string = unicodedata.normalize('NFKD', unicode(string)).lower()  # NFKD-normalize and lowercase; ROMAN's letter keys are lowercase
+    previous = 0  # value of the last numeral seen, i.e. the one to the right
+    building = 0  # running total
+    for char in reversed(string):  # walk the numeral right to left
+        try:
+            value = ROMAN[char]
+        except KeyError:
+            continue  # characters with no ROMAN entry are skipped
+        if value < previous:
+            building -= value  # smaller than the numeral to its right: subtract
+        else:
+            building += value
+        previous = value
+    return building