-def strip_punc(string):
- return filter(lambda c: unicodedata.category(c)[0] not in "PS", string)
-
-def strip_ends(string):
- while string and unicodedata.category(string[0])[0] in "ZPS":
- string = string[1:]
- while string and unicodedata.category(string[-1])[0] in "ZPS":
- string = string[:-1]
- return string
+CONTINUE_ON = frozenset([  # two-letter codes; presumably Unicode general categories (as returned by unicodedata.category) -- TODO confirm at the point of use
+ "Ll", "Lm", "Lo", "Lt", "Lu",  # letter categories (assuming Unicode general categories)
+ "Mc", "Me", "Mn",  # mark categories
+ "Nd", "Nl", "No",  # number categories
+ "Po",  # "other punctuation" only; no other P* category is listed
+ "Zs",  # space separator only; Zl/Zp (line/paragraph separators) are not listed
+ ])
+
+UNKNOWN, LETTER, NUMBER = range(3)  # three distinct integer tags: 0, 1, 2
+
+def sortemes(string):
+ """Generate a list of sortemes for the string.
+
+ A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
+ sort information. This is larger than a word boundary but smaller
+ than a sentence boundary; roughly, a sorteme boundary occurs between
+ letters and numbers, between numbers and numbers if 'too much'
+ punctuation exists in between, and between lines.
+
+ There is no formal specification for sortemes; the goal of this
+ function is to provide good output for Collator.sortemekey.
+ """