UNKNOWN, LETTER, NUMBER = range(3)
-BREAKER = u"\u2029"
+BREAKER = u"\u2029" # Paragraph break character
+INFINITY = float('inf')
-def sortemes(string):
+def sortemes(string, key=lambda s: s):
"""Generate a list of sortemes for the string.
A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
sort information. This is larger than a word boundary but smaller
than a sentence boundary; roughly, a sorteme boundary occurs between
- letters and numbers, between numbers and numbrs if 'too much'
+ letters and numbers, between numbers and numbers if 'too much'
punctuation exists in between, and between lines.
There is no formal specification for sortemes; the goal of this
"""
words = []
+ letters = []
+ digits = []
if not string:
return words
string = unicode(string)
- start = None
- last = None
- mode = UNKNOWN
- previous_mode = UNKNOWN
- category = "XX"
+ categories = map(unicodedata.category, string)
+ previous = UNKNOWN
+
+ def stripends(word):
+ while word and unicodedata.category(word[0])[0] in "PS":
+ word = word[1:]
+ while word and unicodedata.category(word[-1])[0] in "PS":
+ word = word[:-1]
+ return word
+
+ def aletters(letters):
+ words.append((INFINITY, stripends(letters)))
+ def adigits(digits):
+ words.append((numeric(digits), u''))
# TODO(jfw): This kind of evolved over time, there's probably a much
# faster / more concise way to express it now.
- for i, c in enumerate(string):
- broke = False
- prev_category = category
- this_mode = mode
- category = unicodedata.category(c)
+ for i, (c, category) in enumerate(zip(string, categories)):
+
+ if letters and previous == LETTER and words:
+ word = stripends(words.pop()[1].strip()) + BREAKER
+ letters.insert(0, word)
+ previous = UNKNOWN
# Split at the first letter following a number or
# non-continuing character.
if category[0] == "L":
- if mode != LETTER:
- broke = True
- mode = LETTER
+ letters.append(c)
+ if digits:
+ adigits(u"".join(digits).strip())
+ digits = []
+ previous = NUMBER
# Split at the first number following a non-number or
# non-continuing character.
elif category[0] == "N":
- if mode != NUMBER:
- broke = True
- mode = NUMBER
-
- # Split if we find a non-continuing character ("weird" ones).
- elif category not in CONTINUE_ON:
- broke = True
- mode = UNKNOWN
+ digits.append(c)
+ if letters:
+ aletters(u"".join(letters))
+ letters = []
+ previous = LETTER
# Only certain punctuation allowed in numbers.
- elif mode == NUMBER and category[0] == "P" and c not in "',._":
- broke = True
- mode = UNKNOWN
+ elif digits and c not in "',._":
+ adigits(u"".join(digits))
+ digits = []
+ previous = NUMBER
+
+ # Split if we find a non-continuing character ("weird" ones).
+ elif letters and category not in CONTINUE_ON:
+ if letters:
+ aletters(u"".join(letters).strip() + BREAKER)
+ letters = []
+ previous = LETTER
+ if digits:
+ adigits(u"".join(digits).strip())
+ digits = []
+ previous = NUMBER
# Split if we find two pieces of punctuation in a row, even
# if we should otherwise continue.
- elif prev_category[0] in "P" and category[0] in "P":
- broke = True
- mode = UNKNOWN
-
- if broke and start is not None and last is not None:
- # If we read two strings separated by weird punctuation,
- # pretend the punctuation isn't there.
- if this_mode == previous_mode == LETTER:
- words[-1] += BREAKER + string[start:last+1]
- else:
- if this_mode == NUMBER and previous_mode == LETTER:
- words[-1] += BREAKER
- words.append(string[start:last+1])
- previous_mode = this_mode
-
- if broke:
- start = i
- last = None
- if category[0] in "LN":
- last = i
- this_mode = mode
- if start is not None and last is not None:
- if this_mode == LETTER and previous_mode == LETTER and words:
- words[-1] += BREAKER + string[start:last+1]
+ elif i and categories[i-1][0] in "P" and category[0] in "P":
+ if letters:
+ aletters(u"".join(letters))
+ letters = []
+ previous = LETTER
+ if digits:
+ adigits(u"".join(digits))
+ digits = []
+ previous = NUMBER
+
else:
- if this_mode == NUMBER and previous_mode == LETTER and words:
- words[-1] += BREAKER
- words.append(string[start:last+1])
- return words
+ if digits:
+ digits.append(c)
+ elif letters:
+ letters.append(c)
-def numeric(orig, invalid=float('inf')):
+ if letters and previous == LETTER and words:
+ word = stripends(words.pop()[1].strip()) + BREAKER
+ letters.insert(0, word)
+ previous = UNKNOWN
+
+ if letters:
+ aletters(u"".join(letters))
+ if digits:
+ adigits(u"".join(digits))
+
+ return [(i, key(w) if w else u'') for i, w in words]
+
+def numeric(orig, invalid=INFINITY):
if not orig:
- return (invalid, '')
+ return invalid
string = unicode(orig)
for c in string:
if c.isnumeric():
break
else:
- return (invalid, orig)
+ return invalid
mult = 1
while string[:1] == u"-" or string[:1] == u"+":
string = normalize_punc(string)
- # Early out if possible.
- try:
- return (float(string) * mult, orig)
- except ValueError:
- pass
-
# Otherwise we need to do this the hard way.
def _numeric(string):
total = 0
whole, frac = string.split(".")
whole = _numeric(whole)
frac = _numeric(frac) / (10.0 ** len(frac))
- return (mult * (whole + frac), orig)
+ return mult * (whole + frac)
except ValueError:
- return (mult * _numeric(string), orig)
+ return mult * _numeric(string)
def normalize_punc(string):
string = unicode(string.strip(u",.'"))