Minor tweaks for better numeric-following-split-strings.

[python-collate.git] / collate / _strings.py
diff --git a/collate/_strings.py b/collate/_strings.py

index 6bcfd9c..aed2ba7 100644 (file)
--- a/collate/_strings.py
+++ b/collate/_strings.py
@@ -1,84 +1,175 @@
-def alnumsplit(string):
+import unicodedata
+
+CONTINUE_ON = frozenset([
+    "Ll", "Lm", "Lo", "Lt", "Lu",
+    "Mc", "Me", "Mn",
+    "Nd", "Nl", "No",
+    "Po",
+    "Zs",
+    ])
+
+UNKNOWN, LETTER, NUMBER = range(3)
+
+def sortemes(string):
+    """Generate a list of sortemes for the string.
+
+    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
+    sort information. This is larger than a word boundary but smaller
+    than a sentence boundary; roughly, a sorteme boundary occurs between
+    letters and numbers, between numbers and numbers if 'too much'
+    punctuation exists in between, and between lines.
+
+    There is no formal specification for sortemes; the goal of this
+    function is to provide good output for Collator.sortemekey.
+    """
+
+    words = []
+    if not string:
+        return words
      string = unicode(string)
-    strings = []
-    word = []
-    numeric = None
-    for char in string:
-        if numeric is None:
-            broke = False
-            if char.isnumeric():
-                numeric = True
-            elif char.isalpha():
-                numeric = False
-        elif numeric and char.isalpha():
+    start = None
+    last = None
+    mode = UNKNOWN
+    previous_mode = UNKNOWN
+    category = "XX"
+    for i, c in enumerate(string):
+        broke = False
+        prev_category = category
+        this_mode = mode
+        category = unicodedata.category(c)
+
+        # Split at the first letter following a number or
+        # non-continuing character.
+        if category[0] == "L":
+            if mode != LETTER:
+                broke = True
+                mode = LETTER
+
+        # Split at the first number following a non-number or
+        # non-continuing character.
+        elif category[0] == "N":
+            if mode != NUMBER:
+                broke = True
+                mode = NUMBER
+
+        # Split if we find a non-continuing character ("weird" ones).
+        elif category not in CONTINUE_ON:
+            broke = True
+            mode = UNKNOWN
+
+        # Only certain punctuation allowed in numbers.
+        elif mode == NUMBER and category[0] == "P" and c not in "',._":
              broke = True
-        elif not numeric and char.isnumeric():
+            mode = UNKNOWN
+
+        # Split if we find two pieces of punctuation in a row, even
+        # if we should otherwise continue.
+        elif i > 0 and prev_category[0] == "P" and category[0] == "P":
              broke = True
+            mode = UNKNOWN
+
+        if broke and start is not None and last is not None:
+            # If we read two strings separated by weird punctuation,
+            # pretend the punctuation isn't there.
+            if (this_mode == previous_mode == LETTER
+                and (category[0] == "P" or prev_category[0] == "P")
+                and words):
+                words[-1] += u" " + string[start:last+1]
+            else:
+                # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
+                # Which sorts after ["foo", "bar"].
+                if this_mode == NUMBER and previous_mode == LETTER and words:
+                    words[-1] += u" "
+                words.append(string[start:last+1])
+                previous_mode = this_mode
+
          if broke:
-            if word:
-                strings.append("".join(word))
-                word = []
-            numeric = None
-        word.append(char)
-    if word:
-        strings.append("".join(word))
-    return strings
-
-def wordlike(string):
-    """Check if a string is 'word-like'.
-
-    Word-like strings contain at least one alphanumeric character.
-    """
-    return any(map(type(string).isalnum, string))
+            start = i
+            last = None
+        if category[0] in "LN":
+            last = i
+    this_mode = mode
+    if start is not None and last is not None:
+        if this_mode == LETTER and previous_mode == LETTER and words:
+            words[-1] += u" " + string[start:last+1]
+        else:
+            if this_mode == NUMBER and previous_mode == LETTER and words:
+                words[-1] += u" "
+            words.append(string[start:last+1])
+    return words
  
-def numeric(string, invalid=float('inf')):
-    string = unicode(string)
-    if not any(map(type(string).isnumeric, string)):
-        return (invalid, string)
-    if not string:
+def numeric(orig, invalid=float('inf')):
+    if not orig:
          return (invalid, '')
  
+    string = unicode(orig)
+    for c in string:
+        if c.isnumeric():
+            break
+    else:
+        return (invalid, orig)
+
      mult = 1
-    while string[:1] == "-" or string[:1] == "+":
-        if string[0] == "-":
+    while string[:1] == u"-" or string[:1] == u"+":
+        if string[:1] == u"-":
              mult = -mult
          string = string[1:]
  
-    # Maybe we got lucky and this is a trivial case...
+    if not string[:1].isnumeric():
+        return (invalid, orig)
+
+    string = normalize_punc(string)
+
+    # Early out if possible.
      try:
-        return float(string) * mult
+        return (float(string) * mult, orig)
      except ValueError:
          pass
  
      # Otherwise we need to do this the hard way.
-    return mult * float(normalize_dots(string))
+    def _numeric(string):
+        total = 0
+        for c in string:
+            v = unicodedata.numeric(c)
+            if v >= 1 or v == 0:
+                total *= 10
+            total += v
+        return total
  
-def normalize_dots(string):
-    string = unicode(string.strip(",.'"))
-    string = filter(lambda u: u.isnumeric() or u in ",.'", string)
-    commas = string.count(",")
-    stops = string.count(".")
-    quotes = string.count("'")
+    try:
+        whole, frac = string.split(".")
+        whole = _numeric(whole)
+        frac = _numeric(frac) / (10.0 ** len(frac))
+        return (mult * (whole + frac), orig)
+    except ValueError:
+        return (mult * _numeric(string), orig)
+
+def normalize_punc(string):
+    string = unicode(string.strip(u",.'"))
+    string = filter(lambda u: u.isnumeric() or u in u",.'", string)
+    commas = string.count(u",")
+    stops = string.count(u".")
+    quotes = string.count(u"'")
  
      # If anything occurs more than once, it's a separator.
      if commas > 1:
-        string = string.replace(",", "")
+        string = string.replace(u",", u"")
          commas = 0
      if stops > 1:
-        string = string.replace(".", "")
+        string = string.replace(u".", u"")
          stops = 0
      if quotes > 1:
-        string = string.replace("'", "")
+        string = string.replace(u"'", u"")
          quotes = 0
  
-    def normalize_two(a, b):
+    def normalize_two(a, b, string):
          # One of each - assume the first is grouping, second is point.
          a_idx = string.rindex(a)
          b_idx = string.rindex(b)
          if a_idx > b_idx:
-            string = string.replace(b, "").replace(a, ".")
+            string = string.replace(b, u"").replace(a, u".")
          else:
-            string = string.replace(a, "").replace(b, ".")
+            string = string.replace(a, u"").replace(b, u".")
          return string
  
      if commas and stops and quotes:
@@ -90,37 +181,47 @@ def normalize_dots(string):
          # Not really valid, so do whatever we want...
          # A'AAA.BB,CC
          # A'AAA,BB.CC
-        comma_idx = string.index(",")
-        stops_idx = string.index(".")
-        quotes_idx = string.index("'")
+        comma_idx = string.index(u",")
+        stops_idx = string.index(u".")
+        quotes_idx = string.index(u"'")
          if (comma_idx < stops_idx < quotes_idx
              or quotes_idx < stops_idx < comma_idx):
-            string = string.replace(",", "").replace("'", "")
+            string = string.replace(u",", u"").replace(u"'", u"")
          elif (comma_idx < quotes_idx < stops_idx
              or stops_idx < quotes_idx < comma_idx):
-            string = string.replace(",", "").replace(".", "").replace("'", ".")
+            string = string.replace(
+                u",", u"").replace(
+                u".", u"").replace(
+                u"'", u".")
          else:
-            string = string.replace("'", "").replace(".", "").replace(",", ".")
+            string = string.replace(
+                u"'", u"").replace(
+                u".", u"").replace(
+                u",", u".")
  
      elif stops and quotes:
-        string = normalize_two('.', "'")
+        string = normalize_two(u".", u"'", string)
  
      elif commas and quotes:
-        string = normalize_two(',', "'")
+        string = normalize_two(u",", u"'", string)
  
      elif commas and stops:
-        string = normalize_two(',', '.')
+        string = normalize_two(u",", u".", string)
  
      elif commas:
-        if string[-4:-3] == "," and len(string) <= 7:
+        if string[-4:-3] == u"," and len(string) <= 7:
              # Single comma as a thousands separator.
-            string = string.replace(",", "")
+            string = string.replace(u",", u"")
          else:
              # Single comma, not thousands - probably a decimal point.
-            string = string.replace(",", ".")
+            string = string.replace(u",", u".")
  
      elif quotes:
          # Single quote, probably MM'SS", equivalent to a decimal point.
-        string = string.replace("'", ".")
+        string = string.replace(u"'", u".")
+
+    elif stops and string[-4:] == ".000":
+        # Single stop, but no decimal - probably grouping.
+        string = string.replace(u".", u"")
  
      return string