+++ /dev/null
import unicodedata
-
# Unicode general categories that sortemes() treats as "continuing"
# characters: letters, marks, numbers, "other" punctuation and space
# separators.  Any character whose category is not listed here forces
# a split.
CONTINUE_ON = frozenset([
    "Ll", "Lm", "Lo", "Lt", "Lu",
    "Mc", "Me", "Mn",
    "Nd", "Nl", "No",
    "Po",
    "Zs",
])

# Scanner modes used by sortemes() to remember what kind of run
# (none yet / letters / numbers) it is currently reading.
UNKNOWN, LETTER, NUMBER = range(3)
-
def sortemes(string):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, and between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    The input is converted to the interpreter's text type (``unicode``
    on Python 2, ``str`` on Python 3) before scanning.  A false-y input
    (empty string, None) yields an empty list.  The result is a list of
    text strings.
    """

    words = []
    if not string:
        return words
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # this works on both without referencing the Python-2-only builtin.
    string = type(u"")(string)
    start = None            # index where the current sorteme began
    last = None             # index of its last letter/number, if any
    mode = UNKNOWN
    previous_mode = UNKNOWN  # mode of the sorteme most recently emitted
    category = "XX"
    for i, c in enumerate(string):
        broke = False
        prev_category = category
        this_mode = mode    # mode of the run that ends if we break here
        category = unicodedata.category(c)

        # Split at the first letter following a number or
        # non-continuing character.
        if category[0] == "L":
            if mode != LETTER:
                broke = True
                mode = LETTER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif category[0] == "N":
            if mode != NUMBER:
                broke = True
                mode = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        elif category not in CONTINUE_ON:
            broke = True
            mode = UNKNOWN

        # Only certain punctuation allowed in numbers.
        elif mode == NUMBER and category[0] == "P" and c not in "',._":
            broke = True
            mode = UNKNOWN

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i > 0 and prev_category[0] == "P" and category[0] == "P":
            broke = True
            mode = UNKNOWN

        if broke and start is not None and last is not None:
            # If we read two strings separated by weird punctuation,
            # pretend the punctuation isn't there.
            if (this_mode == previous_mode == LETTER
                    and (category[0] == "P" or prev_category[0] == "P")
                    and words):
                words[-1] += u" " + string[start:last + 1]
            else:
                # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
                # Which sorts after ["foo", "bar"].
                if this_mode == NUMBER and previous_mode == LETTER and words:
                    words[-1] += u" "
                words.append(string[start:last + 1])
            previous_mode = this_mode

        if broke:
            start = i
            last = None
        # Only letters and numbers extend the emitted slice, so trailing
        # spaces/punctuation are trimmed from each sorteme.
        if category[0] in "LN":
            last = i

    # Flush whatever run was still open when the input ended.
    this_mode = mode
    if start is not None and last is not None:
        if this_mode == LETTER and previous_mode == LETTER and words:
            words[-1] += u" " + string[start:last + 1]
        else:
            if this_mode == NUMBER and previous_mode == LETTER and words:
                words[-1] += u" "
            words.append(string[start:last + 1])
    return words
-
def numeric(orig, invalid=float('inf')):
    """Parse a loosely formatted number out of ``orig``.

    Returns a ``(value, orig)`` tuple.  ``value`` is ``invalid`` when
    ``orig`` is false-y (in which case the second element is ``''``),
    when it contains no numeric character at all, or when the first
    character after any leading ``+``/``-`` signs is not numeric.

    Leading signs are consumed one at a time, each ``-`` flipping the
    sign of the result.  The remainder is passed through
    ``normalize_punc`` and then converted with ``float()``; if that
    fails, the value is accumulated character by character using
    ``unicodedata.numeric``.
    """
    if not orig:
        return (invalid, '')

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(orig)
    if not any(c.isnumeric() for c in string):
        return (invalid, orig)

    mult = 1
    while string[:1] == u"-" or string[:1] == u"+":
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    if not string[:1].isnumeric():
        return (invalid, orig)

    string = normalize_punc(string)

    # Early out if possible.
    try:
        return (float(string) * mult, orig)
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    def _numeric(string):
        total = 0
        for c in string:
            v = unicodedata.numeric(c)
            # Values of zero or at least one are treated as a decimal
            # digit position; smaller (fractional) values are simply
            # added to the running total.
            if v >= 1 or v == 0:
                total *= 10
            total += v
        return total

    try:
        # Unpacking raises ValueError unless there is exactly one ".".
        whole, frac = string.split(".")
        whole = _numeric(whole)
        frac = _numeric(frac) / (10.0 ** len(frac))
        return (mult * (whole + frac), orig)
    except ValueError:
        return (mult * _numeric(string), orig)
-
def normalize_punc(string):
    """Reduce a loosely punctuated number to digits plus at most one ".".

    Leading/trailing ``,``, ``.`` and ``'`` are stripped, every character
    that is neither numeric nor one of those three separators is dropped,
    and the remaining separators are interpreted heuristically as either
    grouping marks (removed) or the decimal point (rewritten as ``.``).

    Returns the normalized text string.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(string.strip(u",.'"))
    # Build the filtered text with join(): filter() on a string returns
    # an iterator on Python 3, which has no count()/replace().
    string = u"".join(
        ch for ch in string if ch.isnumeric() or ch in u",.'")
    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    def normalize_two(a, b, string):
        # One of each - whichever occurs last is taken as the decimal
        # point, the other as a grouping mark.
        a_idx = string.rindex(a)
        b_idx = string.rindex(b)
        if a_idx > b_idx:
            string = string.replace(b, u"").replace(a, u".")
        else:
            string = string.replace(a, u"").replace(b, u".")
        return string

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        comma_idx = string.index(u",")
        stops_idx = string.index(u".")
        quotes_idx = string.index(u"'")
        if (comma_idx < stops_idx < quotes_idx
                or quotes_idx < stops_idx < comma_idx):
            # The stop is in the middle: keep it, drop the others.
            string = string.replace(u",", u"").replace(u"'", u"")
        elif (comma_idx < quotes_idx < stops_idx
                or stops_idx < quotes_idx < comma_idx):
            # The quote is in the middle: it becomes the decimal point.
            string = string.replace(
                u",", u"").replace(
                u".", u"").replace(
                u"'", u".")
        else:
            # The comma is in the middle: it becomes the decimal point.
            string = string.replace(
                u"'", u"").replace(
                u".", u"").replace(
                u",", u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string[-4:] == u".000":
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string