UNKNOWN, LETTER, NUMBER = range(3)
BREAKER = u"\u2029" # Paragraph break character
+INFINITY = float('inf')
-def sortemes(string):
+def sortemes(string, key=lambda s: s):
"""Generate a list of sortemes for the string.
A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
word = word[:-1]
return word
+ def aletters(letters):
+ words.append((INFINITY, stripends(letters)))
+ def adigits(digits):
+ words.append((numeric(digits), u''))
+
# TODO(jfw): This kind of evolved over time, there's probably a much
# faster / more concise way to express it now.
for i, (c, category) in enumerate(zip(string, categories)):
if letters and previous == LETTER and words:
- word = stripends(words.pop().strip())
- letters = list(stripends(word).strip() + BREAKER) + letters
+ word = stripends(words.pop()[1].strip()) + BREAKER
+ letters.insert(0, word)
previous = UNKNOWN
# Split at the first letter following a number or
if category[0] == "L":
letters.append(c)
if digits:
- words.append(u"".join(digits).strip())
- previous = NUMBER
+ adigits(u"".join(digits).strip())
digits = []
+ previous = NUMBER
# Split at the first number following a non-number or
# non-continuing character.
elif category[0] == "N":
digits.append(c)
if letters:
- words.append(u"".join(letters))
- previous = LETTER
+ aletters(u"".join(letters))
letters = []
+ previous = LETTER
# Only certain punctuation allowed in numbers.
elif digits and c not in "',._":
- words.append(u"".join(digits))
- previous = NUMBER
+ adigits(u"".join(digits))
digits = []
+ previous = NUMBER
# Split if we find a non-continuing character ("weird" ones).
elif letters and category not in CONTINUE_ON:
if letters:
- words.append(u"".join(letters).strip() + BREAKER)
- previous = LETTER
+ aletters(u"".join(letters).strip() + BREAKER)
letters = []
+ previous = LETTER
if digits:
- words.append(u"".join(digits).strip() + BREAKER)
- previous = NUMBER
+ adigits(u"".join(digits).strip())
digits = []
+ previous = NUMBER
# Split if we find two pieces of punctuation in a row, even
# if we should otherwise continue.
elif i and categories[i-1][0] in "P" and category[0] in "P":
if letters:
- words.append(u"".join(letters))
- previous = LETTER
+ aletters(u"".join(letters))
letters = []
+ previous = LETTER
if digits:
- words.append(u"".join(digits))
- previous = NUMBER
+ adigits(u"".join(digits))
digits = []
+ previous = NUMBER
else:
if digits:
letters.append(c)
if letters and previous == LETTER and words:
- word = stripends(words.pop().strip())
- letters = list(stripends(word).strip() + BREAKER) + letters
+ word = stripends(words.pop()[1].strip()) + BREAKER
+ letters.insert(0, word)
previous = UNKNOWN
if letters:
- words.append(u"".join(letters))
- letters = []
+ aletters(u"".join(letters))
if digits:
- words.append(u"".join(digits))
- digits = []
+ adigits(u"".join(digits))
- words = map(stripends, words)
- return words
+ return [(i, key(w) if w else u'') for i, w in words]
def numeric(orig, invalid=float('inf')):
if not orig:
- return (invalid, '')
+ return invalid
string = unicode(orig)
for c in string:
if c.isnumeric():
break
else:
- return (invalid, orig)
+ return invalid
mult = 1
while string[:1] == u"-" or string[:1] == u"+":
# Early out if possible.
try:
- return (float(string) * mult, orig)
+ return float(string) * mult
except ValueError:
pass
whole, frac = string.split(".")
whole = _numeric(whole)
frac = _numeric(frac) / (10.0 ** len(frac))
- return (mult * (whole + frac), orig)
+ return mult * (whole + frac)
except ValueError:
- return (mult * _numeric(string), orig)
+ return mult * _numeric(string)
def normalize_punc(string):
string = unicode(string.strip(u",.'"))