-import collate._strings
+import collate.strings
class Collator(object):
def cmp(self, string1, string2):
"""Return negative if a < b, zero if a == b, positive if a > b."""
return cmp(self.key(string1), self.key(string2))
+    def words(self, string):
+        """Split the string along word boundaries.
+
+        If the string is a str instance, it is first decoded to a
+        unicode instance according to the 'encoding' attribute of the
+        Collator, using the 'replace' error handler. The text is then
+        split on runs of whitespace and the pieces returned as a list.
+        """
+        if isinstance(string, str):
+            string = string.decode(self.encoding, 'replace')
+        return string.split()
+
def sortemekey(self, string, invalid=float('inf')):
+ """Return a key based on sortemes of a string.
+
+ If the string is a str instance, it is decoded to a unicode
+ instance according to the 'encoding' attribute of the
+ Collator.
+ """
keys = []
- for sorteme in collate._strings.sortemes(string):
- num, alpha = collate._strings.numeric(sorteme, invalid)
+ if isinstance(string, str):
+ string = string.decode(self.encoding, 'replace')
+ for sorteme in collate.strings.sortemes(string):
+ num, alpha = collate.strings.numeric(sorteme, invalid)
if num == invalid:
keys.append(self.key(alpha))
else:
+++ /dev/null
-import unicodedata
-
-CONTINUE_ON = frozenset([
- "Ll", "Lm", "Lo", "Lt", "Lu",
- "Mc", "Me", "Mn",
- "Nd", "Nl", "No",
- "Po",
- "Zs",
- ])
-
-UNKNOWN, LETTER, NUMBER = range(3)
-
-def sortemes(string):
- """Generate a list of sortemes for the string.
-
- A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
- sort information. This is larger than a word boundry but smaller
- than a sentence boundry; roughly, a sorteme boundry occurs between
- letters and numbers, between numbers and numbrs if 'too much'
- punctuation exists in between, between lines.
-
- There is no formal specification for sortemes; the goal of this
- function is to provide good output for Collator.sortemekey.
- """
-
- words = []
- if not string:
- return words
- string = unicode(string)
- start = None
- last = None
- mode = UNKNOWN
- previous_mode = UNKNOWN
- category = "XX"
- for i, c in enumerate(string):
- broke = False
- prev_category = category
- this_mode = mode
- category = unicodedata.category(c)
-
- # Split at the first letter following a number or
- # non-continuing character.
- if category[0] == "L":
- if mode != LETTER:
- broke = True
- mode = LETTER
-
- # Split at the first number following a non-number or
- # non-continuing character.
- elif category[0] == "N":
- if mode != NUMBER:
- broke = True
- mode = NUMBER
-
- # Split if we find a non-continuing character ("weird" ones).
- elif category not in CONTINUE_ON:
- broke = True
- mode = UNKNOWN
-
- # Only certain punctuation allowed in numbers.
- elif mode == NUMBER and category[0] == "P" and c not in "',._":
- broke = True
- mode = UNKNOWN
-
- # Split if we find two pieces of punctuation in a row, even
- # if we should otherwise continue.
- elif i > 0 and prev_category[0] == "P" and category[0] == "P":
- broke = True
- mode = UNKNOWN
-
- if broke and start is not None and last is not None:
- # If we read two strings separated by weird punctuation,
- # pretend the punctuation isn't there.
- if (this_mode == previous_mode == LETTER
- and (category[0] == "P" or prev_category[0] == "P")
- and words):
- words[-1] += u" " + string[start:last+1]
- else:
- # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
- # Which sorts after ["foo", "bar"].
- if this_mode == NUMBER and previous_mode == LETTER and words:
- words[-1] += u" "
- words.append(string[start:last+1])
- previous_mode = this_mode
-
- if broke:
- start = i
- last = None
- if category[0] in "LN":
- last = i
- this_mode = mode
- if start is not None and last is not None:
- if this_mode == LETTER and previous_mode == LETTER and words:
- words[-1] += u" " + string[start:last+1]
- else:
- if this_mode == NUMBER and previous_mode == LETTER and words:
- words[-1] += u" "
- words.append(string[start:last+1])
- return words
-
-def numeric(orig, invalid=float('inf')):
- if not orig:
- return (invalid, '')
-
- string = unicode(orig)
- for c in string:
- if c.isnumeric():
- break
- else:
- return (invalid, orig)
-
- mult = 1
- while string[:1] == u"-" or string[:1] == u"+":
- if string[:1] == u"-":
- mult = -mult
- string = string[1:]
-
- if not string[:1].isnumeric():
- return (invalid, orig)
-
- string = normalize_punc(string)
-
- # Early out if possible.
- try:
- return (float(string) * mult, orig)
- except ValueError:
- pass
-
- # Otherwise we need to do this the hard way.
- def _numeric(string):
- total = 0
- for c in string:
- v = unicodedata.numeric(c)
- if v >= 1 or v == 0:
- total *= 10
- total += v
- return total
-
- try:
- whole, frac = string.split(".")
- whole = _numeric(whole)
- frac = _numeric(frac) / (10.0 ** len(frac))
- return (mult * (whole + frac), orig)
- except ValueError:
- return (mult * _numeric(string), orig)
-
-def normalize_punc(string):
- string = unicode(string.strip(u",.'"))
- string = filter(lambda u: u.isnumeric() or u in u",.'", string)
- commas = string.count(u",")
- stops = string.count(u".")
- quotes = string.count(u"'")
-
- # If anything occurs more than once, it's a separator.
- if commas > 1:
- string = string.replace(u",", u"")
- commas = 0
- if stops > 1:
- string = string.replace(u".", u"")
- stops = 0
- if quotes > 1:
- string = string.replace(u"'", u"")
- quotes = 0
-
- def normalize_two(a, b, string):
- # One of each - assume the first is grouping, second is point.
- a_idx = string.rindex(a)
- b_idx = string.rindex(b)
- if a_idx > b_idx:
- string = string.replace(b, u"").replace(a, u".")
- else:
- string = string.replace(a, u"").replace(b, u".")
- return string
-
- if commas and stops and quotes:
- # If all three, assume the middle is the decimal point.
- # A,AAA.BB'CC
- # A.AAA,BB'CC
- # A,AAA'BB.CC
- # A.AAA'BB,CC
- # Not really valid, so do whatever we want...
- # A'AAA.BB,CC
- # A'AAA,BB.CC
- comma_idx = string.index(u",")
- stops_idx = string.index(u".")
- quotes_idx = string.index(u"'")
- if (comma_idx < stops_idx < quotes_idx
- or quotes_idx < stops_idx < comma_idx):
- string = string.replace(u",", u"").replace(u"'", u"")
- elif (comma_idx < quotes_idx < stops_idx
- or stops_idx < quotes_idx < comma_idx):
- string = string.replace(
- u",", u"").replace(
- u".", u"").replace(
- u"'", u".")
- else:
- string = string.replace(
- u"'", u"").replace(
- u".", u"").replace(
- u",", u".")
-
- elif stops and quotes:
- string = normalize_two(u".", u"'", string)
-
- elif commas and quotes:
- string = normalize_two(u",", u"'", string)
-
- elif commas and stops:
- string = normalize_two(u",", u".", string)
-
- elif commas:
- if string[-4:-3] == u"," and len(string) <= 7:
- # Single comma as a thousands separator.
- string = string.replace(u",", u"")
- else:
- # Single comma, not thousands - probably a decimal point.
- string = string.replace(u",", u".")
-
- elif quotes:
- # Single quote, probably MM'SS", equivalent to a decimal point.
- string = string.replace(u"'", u".")
-
- elif stops and string[-4:] == ".000":
- # Single stop, but no decimal - probably grouping.
- string = string.replace(u".", u"")
-
- return string
# so this is a harmless error.
self._breaker = _icu.WordBreaker("root")
+    def words(self, string):
+        """Split the string along word boundaries.
+
+        If the string is a str instance, it is first decoded to a
+        unicode instance according to the 'encoding' attribute of the
+        Collator. Bytes that cannot be decoded are replaced instead of
+        raising UnicodeDecodeError, which is the same lenient decoding
+        the plain Collator uses in words() and sortemekey().
+
+        The text is handed to the word breaker, and any segment that
+        consists only of whitespace is dropped from the returned list.
+        """
+        if isinstance(string, str):
+            # Decode leniently so malformed input cannot raise here.
+            string = string.decode(self.encoding, 'replace')
+        words = self._breaker.words(string)
+        return [w for w in words if not w.isspace()]
+
def key(self, string):
"""Sort key for a string.
--- /dev/null
+import unicodedata
+
+# Unicode general categories (as returned by unicodedata.category) that
+# sortemes() treats as "continuing": a character whose category is not
+# listed here always forces a break.
+CONTINUE_ON = frozenset([
+    "Ll", "Lm", "Lo", "Lt", "Lu",
+    "Mc", "Me", "Mn",
+    "Nd", "Nl", "No",
+    "Po",
+    "Zs",
+    ])
+
+# Scanner modes used by sortemes() for the kind of run being read.
+UNKNOWN, LETTER, NUMBER = range(3)
+
+# U+2029 (PARAGRAPH SEPARATOR). sortemes() inserts it where it merges two
+# letter runs, and after a letter run that is followed by a number run.
+BREAKER = u"\u2029"
+
+def sortemes(string):
+    """Generate a list of sortemes for the string.
+
+    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
+    sort information. This is larger than a word boundary but smaller
+    than a sentence boundary; roughly, a sorteme boundary occurs between
+    letters and numbers, between numbers and numbers if 'too much'
+    punctuation exists in between, between lines.
+
+    There is no formal specification for sortemes; the goal of this
+    function is to provide good output for Collator.sortemekey.
+
+    The input is converted with unicode() before scanning. The result
+    is a list of unicode strings; a false (e.g. empty) input yields an
+    empty list.
+    """
+
+    words = []
+    if not string:
+        return words
+    string = unicode(string)
+    # start: index where the current run began (None before any break).
+    # last: index of the latest letter/number seen in the current run,
+    # so trailing punctuation and spaces are not included in the slice.
+    start = None
+    last = None
+    # mode: kind of run being read now; previous_mode: kind of the run
+    # most recently stored in (or merged into) words.
+    mode = UNKNOWN
+    previous_mode = UNKNOWN
+    # Placeholder category for "before the first character".
+    category = "XX"
+
+    # TODO(jfw): This kind of evolved over time, there's probably a much
+    # faster / more concise way to express it now.
+    for i, c in enumerate(string):
+        broke = False
+        prev_category = category
+        # Remember the mode of the run that ends if this character breaks.
+        this_mode = mode
+        category = unicodedata.category(c)
+
+        # Split at the first letter following a number or
+        # non-continuing character.
+        if category[0] == "L":
+            if mode != LETTER:
+                broke = True
+                mode = LETTER
+
+        # Split at the first number following a non-number or
+        # non-continuing character.
+        elif category[0] == "N":
+            if mode != NUMBER:
+                broke = True
+                mode = NUMBER
+
+        # Split if we find a non-continuing character ("weird" ones).
+        elif category not in CONTINUE_ON:
+            broke = True
+            mode = UNKNOWN
+
+        # Only certain punctuation allowed in numbers.
+        elif mode == NUMBER and category[0] == "P" and c not in "',._":
+            broke = True
+            mode = UNKNOWN
+
+        # Split if we find two pieces of punctuation in a row, even
+        # if we should otherwise continue.
+        elif i > 0 and prev_category[0] == "P" and category[0] == "P":
+            broke = True
+            mode = UNKNOWN
+
+        # Flush the run that just ended, but only if it contained at
+        # least one letter or number (last is set).
+        if broke and start is not None and last is not None:
+            # If the run that just ended and the previously stored run
+            # are both letter runs, merge them into a single sorteme
+            # joined by BREAKER.
+            if (this_mode == previous_mode == LETTER
+                and words):
+                words[-1] += BREAKER + string[start:last+1]
+            else:
+                # This ensures "foo2 bar" yields
+                # ["foo" + BREAKER, "2", "bar"],
+                # which sorts after ["foo", "bar"].
+                if this_mode == NUMBER and previous_mode == LETTER and words:
+                    words[-1] += BREAKER
+                words.append(string[start:last+1])
+            previous_mode = this_mode
+
+        if broke:
+            start = i
+            last = None
+        if category[0] in "LN":
+            last = i
+    # Flush the final run using the same rules as inside the loop.
+    this_mode = mode
+    if start is not None and last is not None:
+        if this_mode == LETTER and previous_mode == LETTER and words:
+            words[-1] += BREAKER + string[start:last+1]
+        else:
+            if this_mode == NUMBER and previous_mode == LETTER and words:
+                words[-1] += BREAKER
+            words.append(string[start:last+1])
+    return words
+
+def numeric(orig, invalid=float('inf')):
+    """Interpret a string as a number.
+
+    Returns a 2-tuple. For a false (e.g. empty) orig the result is
+    (invalid, ''). If orig contains no numeric character, or does not
+    begin with one once leading '+' and '-' signs are removed, the
+    result is (invalid, orig). Otherwise the result is (value, orig),
+    where value is the parsed number with the sign applied.
+    """
+    if not orig:
+        return (invalid, '')
+
+    string = unicode(orig)
+    # for/else: the else branch runs only when no numeric character was
+    # found anywhere in the string.
+    for c in string:
+        if c.isnumeric():
+            break
+    else:
+        return (invalid, orig)
+
+    # Consume any run of leading signs; each '-' flips the sign.
+    mult = 1
+    while string[:1] == u"-" or string[:1] == u"+":
+        if string[:1] == u"-":
+            mult = -mult
+        string = string[1:]
+
+    if not string[:1].isnumeric():
+        return (invalid, orig)
+
+    string = normalize_punc(string)
+
+    # Early out if possible.
+    try:
+        return (float(string) * mult, orig)
+    except ValueError:
+        pass
+
+    # Otherwise we need to do this the hard way.
+    def _numeric(string):
+        # Accumulate per-character numeric values. A value of 0 or >= 1
+        # is treated as a new decimal place; any other value (e.g. a
+        # fraction character) is added without shifting.
+        total = 0
+        for c in string:
+            v = unicodedata.numeric(c)
+            if v >= 1 or v == 0:
+                total *= 10
+            total += v
+        return total
+
+    try:
+        # Unpacking raises ValueError unless there is exactly one ".".
+        whole, frac = string.split(".")
+        whole = _numeric(whole)
+        frac = _numeric(frac) / (10.0 ** len(frac))
+        return (mult * (whole + frac), orig)
+    except ValueError:
+        return (mult * _numeric(string), orig)
+
+def normalize_punc(string):
+    """Normalize grouping and decimal punctuation in a numeric string.
+
+    Leading and trailing commas, stops and single quotes are stripped,
+    and every character that is neither numeric nor one of , . ' is
+    dropped. The remaining separators are then resolved heuristically:
+    the one judged to be the decimal point becomes ".", and the ones
+    judged to be grouping separators are removed.
+
+    Returns the normalized string.
+    """
+    string = unicode(string.strip(u",.'"))
+    # Relies on Python 2 filter(), which returns a unicode string when
+    # given one; the .count()/.replace() calls below need a string.
+    string = filter(lambda u: u.isnumeric() or u in u",.'", string)
+    commas = string.count(u",")
+    stops = string.count(u".")
+    quotes = string.count(u"'")
+
+    # If anything occurs more than once, it's a separator.
+    if commas > 1:
+        string = string.replace(u",", u"")
+        commas = 0
+    if stops > 1:
+        string = string.replace(u".", u"")
+        stops = 0
+    if quotes > 1:
+        string = string.replace(u"'", u"")
+        quotes = 0
+
+    # From here on each of commas/stops/quotes is either 0 or 1.
+
+    def normalize_two(a, b, string):
+        # One of each - assume the first is grouping, second is point.
+        a_idx = string.rindex(a)
+        b_idx = string.rindex(b)
+        if a_idx > b_idx:
+            string = string.replace(b, u"").replace(a, u".")
+        else:
+            string = string.replace(a, u"").replace(b, u".")
+        return string
+
+    if commas and stops and quotes:
+        # If all three, assume the middle is the decimal point.
+        # A,AAA.BB'CC
+        # A.AAA,BB'CC
+        # A,AAA'BB.CC
+        # A.AAA'BB,CC
+        # Not really valid, so do whatever we want...
+        # A'AAA.BB,CC
+        # A'AAA,BB.CC
+        comma_idx = string.index(u",")
+        stops_idx = string.index(u".")
+        quotes_idx = string.index(u"'")
+        if (comma_idx < stops_idx < quotes_idx
+            or quotes_idx < stops_idx < comma_idx):
+            # The stop is in the middle: keep it, drop the other two.
+            string = string.replace(u",", u"").replace(u"'", u"")
+        elif (comma_idx < quotes_idx < stops_idx
+              or stops_idx < quotes_idx < comma_idx):
+            # The quote is in the middle: it becomes the decimal point.
+            string = string.replace(
+                u",", u"").replace(
+                u".", u"").replace(
+                u"'", u".")
+        else:
+            # The comma is in the middle: it becomes the decimal point.
+            string = string.replace(
+                u"'", u"").replace(
+                u".", u"").replace(
+                u",", u".")
+
+    elif stops and quotes:
+        string = normalize_two(u".", u"'", string)
+
+    elif commas and quotes:
+        string = normalize_two(u",", u"'", string)
+
+    elif commas and stops:
+        string = normalize_two(u",", u".", string)
+
+    elif commas:
+        # The comma is fourth from the end (exactly three characters
+        # follow it) and the whole string is at most 7 characters long.
+        if string[-4:-3] == u"," and len(string) <= 7:
+            # Single comma as a thousands separator.
+            string = string.replace(u",", u"")
+        else:
+            # Single comma, not thousands - probably a decimal point.
+            string = string.replace(u",", u".")
+
+    elif quotes:
+        # Single quote, probably MM'SS", equivalent to a decimal point.
+        string = string.replace(u"'", u".")
+
+    elif stops and string[-4:] == ".000":
+        # Single stop, but no decimal - probably grouping.
+        string = string.replace(u".", u"")
+
+    return string