+"""String utility functions for collation."""
+
+__all__ = ["sortemes", "numeric", "normalize_number", "deroman"]
+
import unicodedata
CONTINUE_ON = frozenset([
UNKNOWN, LETTER, NUMBER = range(3)
-BREAKER = u"\u2029" # Paragraph break character
+BREAKER = u"\u2028" # Line break character
+HBREAKER = u"\u2029" # Paragraph break character
INFINITY = float('inf')
+KEEP_IN_NUMBERS = u"'.,"
+ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
+
+ROMAN = {
+ u"i": 1,
+ u"v": 5,
+ u"x": 10,
+ u"l": 50,
+ u"c": 100,
+ u"d": 500,
+ u"m": 1000,
+ u"\u2180": 1000,
+ u"\u2181": 5000,
+ u"\u2182": 10000,
+ u"\u2183": 100,
+ u"\u2184": 100,
+ u"\u2185": 6,
+ u"\u2186": 50,
+ u"\u2187": 50000,
+ u"\u2188": 100000,
+ }
+
+def stripends(word):
+ """Strip punctuation and symbols from the ends of a string."""
+ while word and unicodedata.category(word[0])[0] in "PS":
+ word = word[1:]
+ while word and unicodedata.category(word[-1])[0] in "PS":
+ word = word[:-1]
+ return word
+
def sortemes(string, key=lambda s: s):
"""Generate a list of sortemes for the string.
There is no formal specification for sortemes; the goal of this
function is to provide good output for Collator.sortemekey.
+
"""
words = []
categories = map(unicodedata.category, string)
previous = UNKNOWN
- def stripends(word):
- while word and unicodedata.category(word[0])[0] in "PS":
- word = word[1:]
- while word and unicodedata.category(word[-1])[0] in "PS":
- word = word[:-1]
- return word
-
def aletters(letters):
+ """Add a group of letters to the word list."""
words.append((INFINITY, stripends(letters)))
def adigits(digits):
+ """Add a group of digits to the word list."""
words.append((numeric(digits), u''))
# TODO(jfw): This kind of evolved over time, there's probably a much
# faster / more concise way to express it now.
- for i, (c, category) in enumerate(zip(string, categories)):
+ for i, (uchar, category) in enumerate(zip(string, categories)):
if letters and previous == LETTER and words:
word = stripends(words.pop()[1].strip()) + BREAKER
# Split at the first letter following a number or
# non-continuing character.
if category[0] == "L":
- letters.append(c)
+ letters.append(uchar)
if digits:
adigits(u"".join(digits).strip())
digits = []
# Split at the first number following a non-number or
# non-continuing character.
elif category[0] == "N":
- digits.append(c)
+ digits.append(uchar)
if letters:
+ if unicodedata.category(letters[-1])[0] == "L":
+ letters.append(HBREAKER)
aletters(u"".join(letters))
letters = []
previous = LETTER
# Only certain punctuation allowed in numbers.
- elif digits and c not in "',._":
+ elif digits and uchar not in ALLOWED_IN_NUMBERS:
adigits(u"".join(digits))
digits = []
previous = NUMBER
else:
if digits:
- digits.append(c)
+ digits.append(uchar)
elif letters:
- letters.append(c)
+ letters.append(uchar)
if letters and previous == LETTER and words:
word = stripends(words.pop()[1].strip()) + BREAKER
return [(i, key(w) if w else u'') for i, w in words]
def numeric(orig, invalid=INFINITY):
+ """Parse a number out of a string.
+
+ This function parses a unicode number out of the start of a
+ string. If a number cannot be found at the start, the 'invalid'
+ argument is returned.
+
+ """
+
if not orig:
return invalid
string = unicode(orig)
- for c in string:
- if c.isnumeric():
+ for uchar in string:
+ if uchar.isnumeric():
break
else:
return invalid
+ for char in string:
+ if u"\u2160" <= char <= u"\u2188":
+ return deroman(string)
+
mult = 1
while string[:1] == u"-" or string[:1] == u"+":
if string[:1] == u"-":
string = string[1:]
if not string[:1].isnumeric():
- return (invalid, orig)
+ return invalid
- string = normalize_punc(string)
+ string = normalize_number(string)
- # Otherwise we need to do this the hard way.
def _numeric(string):
+ """Interpreter a number as base 10."""
total = 0
- for c in string:
- v = unicodedata.numeric(c)
- if v >= 1 or v == 0:
+ for uchar in string:
+ number = unicodedata.numeric(uchar)
+ if number >= 1 or number == 0:
total *= 10
- total += v
+ total += number
return total
try:
except ValueError:
return mult * _numeric(string)
-def normalize_punc(string):
- string = unicode(string.strip(u",.'"))
- string = filter(lambda u: u.isnumeric() or u in u",.'", string)
+def normalize_number(string):
+ """Normalize punctuation in a number.
+
+ This function attempts to guess which characters in a number
+ represent grouping separators and which represent decimal
+ points. It returns a string that is valid to pass to Python's
+ float() routine (potentially, NaN, if nothing like a number is
+ found).
+
+ """
+
+ string = unicode(string)
+ string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string)
+ string = string.strip(KEEP_IN_NUMBERS)
+
commas = string.count(u",")
stops = string.count(u".")
quotes = string.count(u"'")
quotes = 0
def normalize_two(a, b, string):
- # One of each - assume the first is grouping, second is point.
+ """One of each - assume the first is grouping, second is point."""
a_idx = string.rindex(a)
b_idx = string.rindex(b)
if a_idx > b_idx:
# Single stop, but no decimal - probably grouping.
string = string.replace(u".", u"")
- return string
+ return string or "NaN"
+
+def deroman(string):
+ """Turn a Roman numeral into an integer."""
+ string = unicodedata.normalize('NFKD', unicode(string)).lower()
+ previous = 0
+ building = 0
+ for char in reversed(string):
+ try:
+ value = ROMAN[char]
+ except KeyError:
+ continue
+ if value < previous:
+ building -= value
+ else:
+ building += value
+ previous = value
+ return building