"""Return negative if a < b, zero if a == b, positive if a > b."""
return cmp(self.key(string1), self.key(string2))
- def words(self, string):
- """Split the string into separate words.
-
- This split is done using Unicode's definition of whitespace.
- """
- return string.split()
-
- def sortemes(self, string):
- return collate._strings.alnumsplit(string)
-
def sortemekey(self, string, invalid=float('inf')):
- words = []
- for sorteme in self.sortemes(string):
+ keys = []
+ for sorteme in collate._strings.sortemes(string):
num, alpha = collate._strings.numeric(sorteme, invalid)
- alpha = self.key(collate._strings.strip_punc(alpha))
- words.append((num, alpha))
- return words
+ if num == invalid:
+ keys.append(self.key(alpha))
+ else:
+ keys.append(num)
+ # Shove the sortkeyed original string on the end to resolve
+ # ties intelligently.
+ return (keys, self.key(string))
import unicodedata
-def strip_punc(string):
- return filter(lambda c: unicodedata.category(c)[0] not in "PS", string)
-
-def strip_ends(string):
- while string and unicodedata.category(string[0])[0] in "ZPS":
- string = string[1:]
- while string and unicodedata.category(string[-1])[0] in "ZPS":
- string = string[:-1]
- return string
+CONTINUE_ON = frozenset([
+ "Ll", "Lm", "Lo", "Lt", "Lu",
+ "Mc", "Me", "Mn",
+ "Nd", "Nl", "No",
+ "Po",
+ "Zs",
+ ])
+
+UNKNOWN, LETTER, NUMBER = range(3)
+
+def sortemes(string):
+ """Generate a list of sortemes for the string.
+
+ A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
+ sort information. This is larger than a word boundary but smaller
+ than a sentence boundary; roughly, a sorteme boundary occurs between
+ letters and numbers, between numbers and numbers if 'too much'
+ punctuation exists in between, and between lines.
+
+ There is no formal specification for sortemes; the goal of this
+ function is to provide good output for Collator.sortemekey.
+ """
-def alnumsplit(string):
+ words = []
if not string:
- return []
+ return words
string = unicode(string)
- strings = []
- numeric = None
- start = 0
- for i, char in enumerate(string):
- category = unicodedata.category(char)
- if numeric is None:
- broke = False
- if char.isnumeric():
- numeric = True
- elif char.isalpha():
- numeric = False
- elif numeric and char.isalpha():
+ start = None
+ last = None
+ mode = UNKNOWN
+ previous_mode = UNKNOWN
+ category = "XX"
+ for i, c in enumerate(string):
+ broke = False
+ prev_category = category
+ this_mode = mode
+ category = unicodedata.category(c)
+
+ # Split at the first letter following a number or
+ # non-continuing character.
+ if category[0] == "L":
+ if mode != LETTER:
+ broke = True
+ mode = LETTER
+
+ # Split at the first number following a non-number or
+ # non-continuing character.
+ elif category[0] == "N":
+ if mode != NUMBER:
+ broke = True
+ mode = NUMBER
+
+ # Split if we find a non-continuing character ("weird" ones).
+ elif category not in CONTINUE_ON:
broke = True
- numeric = False
- elif numeric and category in ["Zs", "Ps", "Pe"]:
+ mode = UNKNOWN
+
+ # Only certain punctuation allowed in numbers.
+ elif mode == NUMBER and category[0] == "P" and c not in "',._":
broke = True
- numeric = None
- elif not numeric and char.isnumeric():
+ mode = UNKNOWN
+
+ # Split if we find two pieces of punctuation in a row, even
+ # if we should otherwise continue.
+ elif i > 0 and prev_category[0] == "P" and category[0] == "P":
broke = True
- numeric = True
+ mode = UNKNOWN
+
+ if broke and start is not None and last is not None:
+ # If we read two strings separated by weird punctuation,
+ # pretend the punctuation isn't there.
+ if (this_mode == previous_mode == LETTER
+ and prev_category[0] == "P"
+ and words):
+ words[-1] += u" " + string[start:last+1]
+ else:
+ # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"],
+ # which sorts after ["foo", "bar"].
+ if this_mode == NUMBER and previous_mode == LETTER and words:
+ words[-1] += u" "
+ words.append(string[start:last+1])
+ previous_mode = this_mode
+
if broke:
- strings.append(strip_ends(string[start:i]))
start = i
- broke = False
- strings.append(strip_ends(string[start:i + 1]))
- return strings
-
-def wordlike(string):
- """Check if a string is 'word-like'.
-
- Word-like strings contain at least one alphanumeric character.
- """
-
- # Explicit loop is faster than:
- #return any(map(type(string).isalnum, string))
-
- for c in string:
- if c.isalnum():
- return True
- else:
- return False
+ last = None
+ if category[0] in "LN":
+ last = i
+ if start is not None and last is not None:
+ if this_mode == previous_mode == LETTER and words:
+ words[-1] += u" " + string[start:last+1]
+ else:
+ if this_mode == NUMBER and previous_mode == LETTER and words:
+ words[-1] += u" "
+ words.append(string[start:last+1])
+ return words
def numeric(orig, invalid=float('inf')):
if not orig:
if not string[:1].isnumeric():
return (invalid, orig)
+ string = normalize_punc(string)
+
# Early out if possible.
try:
return (float(string) * mult, orig)
pass
# Otherwise we need to do this the hard way.
- string = normalize_punc(string)
-
def _numeric(string):
total = 0
for c in string:
# Single quote, probably MM'SS", equivalent to a decimal point.
string = string.replace(u"'", u".")
+ elif stops and string[-4:] == ".000":
+ # Single stop, but no decimal - probably grouping.
+ string = string.replace(u".", u"")
+
return string