"""String utility functions for collation."""
-__all__ = ["sortemes", "numeric", "normalize_number"]
+__all__ = ["sortemes", "numeric", "normalize_number", "deroman"]
import unicodedata
UNKNOWN, LETTER, NUMBER = range(3)
-BREAKER = u"\u2029" # Paragraph break character
+BREAKER = u"\u2028" # Line break character
+HBREAKER = u"\u2029" # Paragraph break character
INFINITY = float('inf')
KEEP_IN_NUMBERS = u"'.,"
ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
+ROMAN = {
+ u"i": 1,
+ u"v": 5,
+ u"x": 10,
+ u"l": 50,
+ u"c": 100,
+ u"d": 500,
+ u"m": 1000,
+ u"\u2180": 1000,
+ u"\u2181": 5000,
+ u"\u2182": 10000,
+ u"\u2183": 100,
+ u"\u2184": 100,
+ u"\u2185": 6,
+ u"\u2186": 50,
+ u"\u2187": 50000,
+ u"\u2188": 100000,
+ }
+
+INITIAL_STOPS = frozenset([u"a", u"an", u"the"])
+
def stripends(word):
"""Strip punctuation and symbols from the ends of a string."""
while word and unicodedata.category(word[0])[0] in "PS":
"""
+ if not string:
+ return []
+
words = []
letters = []
digits = []
- if not string:
- return words
+ lappend = letters.append
+ dappend = digits.append
string = unicode(string)
categories = map(unicodedata.category, string)
previous = UNKNOWN
+ wappend = words.append
+ join = u"".join
+ i = 0
- def aletters(letters):
- """Add a group of letters to the word list."""
- words.append((INFINITY, stripends(letters)))
- def adigits(digits):
- """Add a group of digits to the word list."""
- words.append((numeric(digits), u''))
-
- # TODO(jfw): This kind of evolved over time, there's probably a much
- # faster / more concise way to express it now.
- for i, (uchar, category) in enumerate(zip(string, categories)):
+ for uchar in string:
+ category = categories[i]
if letters and previous == LETTER and words:
word = stripends(words.pop()[1].strip()) + BREAKER
# Split at the first letter following a number or
# non-continuing character.
if category[0] == "L":
- letters.append(uchar)
+ lappend(uchar)
if digits:
- adigits(u"".join(digits).strip())
- digits = []
+ words.append((numeric(join(digits).strip()), u''))
+ del(digits[:])
previous = NUMBER
# Split at the first number following a non-number or
# non-continuing character.
elif category[0] == "N":
- digits.append(uchar)
+ dappend(uchar)
if letters:
- aletters(u"".join(letters))
- letters = []
+ if unicodedata.category(letters[-1])[0] == "L":
+ lappend(HBREAKER)
+ wappend((INFINITY, stripends(join(letters))))
+ del(letters[:])
previous = LETTER
# Only certain punctuation allowed in numbers.
elif digits and uchar not in ALLOWED_IN_NUMBERS:
- adigits(u"".join(digits))
- digits = []
+ words.append((numeric(join(digits)), u''))
+ del(digits[:])
previous = NUMBER
# Split if we find a non-continuing character ("weird" ones).
- elif letters and category not in CONTINUE_ON:
+ elif category not in CONTINUE_ON:
if letters:
- aletters(u"".join(letters).strip() + BREAKER)
- letters = []
+ wappend(
+ (INFINITY,
+ stripends(join(letters).strip() + BREAKER)))
+ del(letters[:])
previous = LETTER
if digits:
- adigits(u"".join(digits).strip())
- digits = []
+ words.append((numeric(join(digits)), u''))
+ del(digits[:])
previous = NUMBER
# Split if we find two pieces of punctuation in a row, even
# if we should otherwise continue.
- elif i and categories[i-1][0] in "P" and category[0] in "P":
+ elif i and categories[i - 1][0] == category[0] == "P":
if letters:
- aletters(u"".join(letters))
- letters = []
+ wappend((INFINITY, stripends(join(letters))))
+ del(letters[:])
previous = LETTER
if digits:
- adigits(u"".join(digits))
- digits = []
+ words.append((numeric(join(digits)), u''))
+ del(digits[:])
previous = NUMBER
else:
if digits:
- digits.append(uchar)
+ dappend(uchar)
elif letters:
- letters.append(uchar)
+ lappend(uchar)
+
+ i += 1
if letters and previous == LETTER and words:
word = stripends(words.pop()[1].strip()) + BREAKER
previous = UNKNOWN
if letters:
- aletters(u"".join(letters))
+ wappend((INFINITY, stripends(join(letters))))
if digits:
- adigits(u"".join(digits))
+ words.append((numeric(join(digits)), u''))
- return [(i, key(w) if w else u'') for i, w in words]
+ return [(i, key(w)) for i, w in words]
def numeric(orig, invalid=INFINITY):
"""Parse a number out of a string.
else:
return invalid
+ for char in string:
+ if u"\u2160" <= char <= u"\u2188":
+ return deroman(string)
+
mult = 1
while string[:1] == u"-" or string[:1] == u"+":
if string[:1] == u"-":
string = string.replace(u".", u"")
return string or "NaN"
+
+def deroman(string):
+ """Turn a Roman numeral into an integer."""
+ string = unicodedata.normalize('NFKD', unicode(string)).lower()
+ previous = 0
+ building = 0
+ for char in reversed(string):
+ try:
+ value = ROMAN[char]
+ except KeyError:
+ continue
+ if value < previous:
+ building -= value
+ else:
+ building += value
+ previous = value
+ return building