UNKNOWN, LETTER, NUMBER = range(3)
-BREAKER = u"\u2029"
+BREAKER = u"\u2029" # U+2029 PARAGRAPH SEPARATOR (paragraph break character)
def sortemes(string):
"""Generate a list of sortemes for the string.
"""
words = []
+ letters = []
+ digits = []
if not string:
return words
string = unicode(string)
- start = None
- last = None
- mode = UNKNOWN
- previous_mode = UNKNOWN
- category = "XX"
+ categories = map(unicodedata.category, string)
+ previous = UNKNOWN
+ types = []
+
+ def stripends(word):
+ while word and unicodedata.category(word[0])[0] in "PS":
+ word = word[1:]
+ while word and unicodedata.category(word[-1])[0] in "PS":
+ word = word[:-1]
+ return word
# TODO(jfw): This kind of evolved over time, there's probably a much
# faster / more concise way to express it now.
- for i, c in enumerate(string):
- broke = False
- prev_category = category
- this_mode = mode
- category = unicodedata.category(c)
+ for i, (c, category) in enumerate(zip(string, categories)):
+
+ if letters and previous == LETTER and words:
+ word = stripends(words.pop().strip())
+ letters = list(stripends(word).strip() + BREAKER) + letters
+ previous = UNKNOWN
# Split at the first letter following a number or
# non-continuing character.
if category[0] == "L":
- if mode != LETTER:
- broke = True
- mode = LETTER
+ letters.append(c)
+ if digits:
+ words.append(u"".join(digits).strip())
+ previous = NUMBER
+ digits = []
# Split at the first number following a non-number or
# non-continuing character.
elif category[0] == "N":
- if mode != NUMBER:
- broke = True
- mode = NUMBER
-
- # Split if we find a non-continuing character ("weird" ones).
- elif category not in CONTINUE_ON:
- broke = True
- mode = UNKNOWN
+ digits.append(c)
+ if letters:
+ words.append(u"".join(letters))
+ previous = LETTER
+ letters = []
# Only certain punctuation allowed in numbers.
- elif mode == NUMBER and category[0] == "P" and c not in "',._":
- broke = True
- mode = UNKNOWN
+ elif digits and c not in "',._":
+ words.append(u"".join(digits))
+ previous = NUMBER
+ digits = []
+
+ # Split if we find a non-continuing character ("weird" ones).
+ elif letters and category not in CONTINUE_ON:
+ if letters:
+ words.append(u"".join(letters).strip() + BREAKER)
+ previous = LETTER
+ letters = []
+ if digits:
+ words.append(u"".join(digits).strip() + BREAKER)
+ previous = NUMBER
+ digits = []
# Split if we find two pieces of punctuation in a row, even
# if we should otherwise continue.
- elif prev_category[0] in "P" and category[0] in "P":
- broke = True
- mode = UNKNOWN
-
- if broke and start is not None and last is not None:
- # If we read two strings separated by weird punctuation,
- # pretend the punctuation isn't there.
- if this_mode == previous_mode == LETTER:
- words[-1] += BREAKER + string[start:last+1]
- else:
- if this_mode == NUMBER and previous_mode == LETTER:
- words[-1] += BREAKER
- words.append(string[start:last+1])
- previous_mode = this_mode
-
- if broke:
- start = i
- last = None
- if category[0] in "LN":
- last = i
- this_mode = mode
- if start is not None and last is not None:
- if this_mode == LETTER and previous_mode == LETTER and words:
- words[-1] += BREAKER + string[start:last+1]
+ elif i and categories[i-1][0] in "P" and category[0] in "P":
+ if letters:
+ words.append(u"".join(letters))
+ previous = LETTER
+ letters = []
+ if digits:
+ words.append(u"".join(digits))
+ previous = NUMBER
+ digits = []
+
else:
- if this_mode == NUMBER and previous_mode == LETTER and words:
- words[-1] += BREAKER
- words.append(string[start:last+1])
+ if digits:
+ digits.append(c)
+ elif letters:
+ letters.append(c)
+
+ if letters and previous == LETTER and words:
+ word = stripends(words.pop().strip())
+ letters = list(stripends(word).strip() + BREAKER) + letters
+ previous = UNKNOWN
+
+ if letters:
+ words.append(u"".join(letters))
+ letters = []
+ if digits:
+ words.append(u"".join(digits))
+ digits = []
+
+ words = map(stripends, words)
return words
def numeric(orig, invalid=float('inf')):