-def alnumsplit(string):
+import unicodedata
+
+CONTINUE_ON = frozenset([
+ "Ll", "Lm", "Lo", "Lt", "Lu",
+ "Mc", "Me", "Mn",
+ "Nd", "Nl", "No",
+ "Po",
+ "Zs",
+ ])
+
+UNKNOWN, LETTER, NUMBER = range(3)
+
+def sortemes(string):
+ """Generate a list of sortemes for the string.
+
+ A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
+ sort information. This is larger than a word boundary but smaller
+ than a sentence boundary; roughly, a sorteme boundary occurs between
+ letters and numbers, between numbers and numbers if 'too much'
+ punctuation exists in between, and between lines.
+
+ There is no formal specification for sortemes; the goal of this
+ function is to provide good output for Collator.sortemekey.
+ """
+
+ words = []
+ if not string:
+ return words
string = unicode(string)
- strings = []
- word = []
- numeric = None
- for char in string:
- if numeric is None:
- broke = False
- if char.isnumeric():
- numeric = True
- elif char.isalpha():
- numeric = False
- elif numeric and char.isalpha():
+ start = None
+ last = None
+ mode = UNKNOWN
+ previous_mode = UNKNOWN
+ category = "XX"
+ for i, c in enumerate(string):
+ broke = False
+ prev_category = category
+ this_mode = mode
+ category = unicodedata.category(c)
+
+ # Split at the first letter following a number or
+ # non-continuing character.
+ if category[0] == "L":
+ if mode != LETTER:
+ broke = True
+ mode = LETTER
+
+ # Split at the first number following a non-number or
+ # non-continuing character.
+ elif category[0] == "N":
+ if mode != NUMBER:
+ broke = True
+ mode = NUMBER
+
+ # Split if we find a non-continuing character ("weird" ones).
+ elif category not in CONTINUE_ON:
+ broke = True
+ mode = UNKNOWN
+
+ # Only certain punctuation allowed in numbers.
+ elif mode == NUMBER and category[0] == "P" and c not in "',._":
broke = True
- elif not numeric and char.isnumeric():
+ mode = UNKNOWN
+
+ # Split if we find two pieces of punctuation in a row, even
+ # if we should otherwise continue.
+ elif i > 0 and prev_category[0] == "P" and category[0] == "P":
broke = True
+ mode = UNKNOWN
+
+ if broke and start is not None and last is not None:
+ # If we read two strings separated by weird punctuation,
+ # pretend the punctuation isn't there.
+ if (this_mode == previous_mode == LETTER
+ and (category[0] == "P" or prev_category[0] == "P")
+ and words):
+ words[-1] += u" " + string[start:last+1]
+ else:
+ # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"],
+ # which sorts after ["foo", "bar"].
+ if this_mode == NUMBER and previous_mode == LETTER and words:
+ words[-1] += u" "
+ words.append(string[start:last+1])
+ previous_mode = this_mode
+
if broke:
- if word:
- strings.append("".join(word))
- word = []
- numeric = None
- word.append(char)
- if word:
- strings.append("".join(word))
- return strings
-
-def wordlike(string):
- """Check if a string is 'word-like'.
-
- Word-like strings contain at least one alphanumeric character.
- """
- return any(map(type(string).isalnum, string))
+ start = i
+ last = None
+ if category[0] in "LN":
+ last = i
+ this_mode = mode
+ if start is not None and last is not None:
+ if this_mode == LETTER and previous_mode == LETTER and words:
+ words[-1] += u" " + string[start:last+1]
+ else:
+ if this_mode == NUMBER and previous_mode == LETTER and words:
+ words[-1] += u" "
+ words.append(string[start:last+1])
+ return words
-def numeric(string, invalid=float('inf')):
- string = unicode(string)
- if not any(map(type(string).isnumeric, string)):
- return (invalid, string)
- if not string:
+def numeric(orig, invalid=float('inf')):
+ if not orig:
return (invalid, '')
+ string = unicode(orig)
+ for c in string:
+ if c.isnumeric():
+ break
+ else:
+ return (invalid, orig)
+
mult = 1
- while string[:1] == "-" or string[:1] == "+":
- if string[0] == "-":
+ while string[:1] == u"-" or string[:1] == u"+":
+ if string[:1] == u"-":
mult = -mult
string = string[1:]
- # Maybe we got lucky and this is a trivial case...
+ if not string[:1].isnumeric():
+ return (invalid, orig)
+
+ string = normalize_punc(string)
+
+ # Early out if possible.
try:
- return float(string) * mult
+ return (float(string) * mult, orig)
except ValueError:
pass
# Otherwise we need to do this the hard way.
- return mult * float(normalize_dots(string))
+ def _numeric(string):
+ total = 0
+ for c in string:
+ v = unicodedata.numeric(c)
+ if v >= 1 or v == 0:
+ total *= 10
+ total += v
+ return total
-def normalize_dots(string):
- string = unicode(string.strip(",.'"))
- string = filter(lambda u: u.isnumeric() or u in ",.'", string)
- commas = string.count(",")
- stops = string.count(".")
- quotes = string.count("'")
+ try:
+ whole, frac = string.split(".")
+ whole = _numeric(whole)
+ frac = _numeric(frac) / (10.0 ** len(frac))
+ return (mult * (whole + frac), orig)
+ except ValueError:
+ return (mult * _numeric(string), orig)
+
+def normalize_punc(string):
+ string = unicode(string.strip(u",.'"))
+ string = filter(lambda u: u.isnumeric() or u in u",.'", string)
+ commas = string.count(u",")
+ stops = string.count(u".")
+ quotes = string.count(u"'")
# If anything occurs more than once, it's a separator.
if commas > 1:
- string = string.replace(",", "")
+ string = string.replace(u",", u"")
commas = 0
if stops > 1:
- string = string.replace(".", "")
+ string = string.replace(u".", u"")
stops = 0
if quotes > 1:
- string = string.replace("'", "")
+ string = string.replace(u"'", u"")
quotes = 0
- def normalize_two(a, b):
+ def normalize_two(a, b, string):
# One of each - assume the first is grouping, second is point.
a_idx = string.rindex(a)
b_idx = string.rindex(b)
if a_idx > b_idx:
- string = string.replace(b, "").replace(a, ".")
+ string = string.replace(b, u"").replace(a, u".")
else:
- string = string.replace(a, "").replace(b, ".")
+ string = string.replace(a, u"").replace(b, u".")
return string
if commas and stops and quotes:
# Not really valid, so do whatever we want...
# A'AAA.BB,CC
# A'AAA,BB.CC
- comma_idx = string.index(",")
- stops_idx = string.index(".")
- quotes_idx = string.index("'")
+ comma_idx = string.index(u",")
+ stops_idx = string.index(u".")
+ quotes_idx = string.index(u"'")
if (comma_idx < stops_idx < quotes_idx
or quotes_idx < stops_idx < comma_idx):
- string = string.replace(",", "").replace("'", "")
+ string = string.replace(u",", u"").replace(u"'", u"")
elif (comma_idx < quotes_idx < stops_idx
or stops_idx < quotes_idx < comma_idx):
- string = string.replace(",", "").replace(".", "").replace("'", ".")
+ string = string.replace(
+ u",", u"").replace(
+ u".", u"").replace(
+ u"'", u".")
else:
- string = string.replace("'", "").replace(".", "").replace(",", ".")
+ string = string.replace(
+ u"'", u"").replace(
+ u".", u"").replace(
+ u",", u".")
elif stops and quotes:
- string = normalize_two('.', "'")
+ string = normalize_two(u".", u"'", string)
elif commas and quotes:
- string = normalize_two(',', "'")
+ string = normalize_two(u",", u"'", string)
elif commas and stops:
- string = normalize_two(',', '.')
+ string = normalize_two(u",", u".", string)
elif commas:
- if string[-4:-3] == "," and len(string) <= 7:
+ if string[-4:-3] == u"," and len(string) <= 7:
# Single comma as a thousands separator.
- string = string.replace(",", "")
+ string = string.replace(u",", u"")
else:
# Single comma, not thousands - probably a decimal point.
- string = string.replace(",", ".")
+ string = string.replace(u",", u".")
elif quotes:
# Single quote, probably MM'SS", equivalent to a decimal point.
- string = string.replace("'", ".")
+ string = string.replace(u"'", u".")
+
+ elif stops and string[-4:] == ".000":
+ # Single stop followed by "000" - probably grouping, not a decimal point.
+ string = string.replace(u".", u"")
return string