projects
/
python-collate.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
16cf8c5
)
strings: Microoptimizations, saves about 10% of runtime.
author
Joe Wreschnig
<joe.wreschnig@gmail.com>
Thu, 25 Feb 2010 23:54:14 +0000
(15:54 -0800)
committer
Joe Wreschnig
<joe.wreschnig@gmail.com>
Thu, 25 Feb 2010 23:54:14 +0000
(15:54 -0800)
collate/strings.py
patch
|
blob
|
history
diff --git
a/collate/strings.py
b/collate/strings.py
index
e2750d1
..
487257f
100644
(file)
--- a/collate/strings.py
+++ b/collate/strings.py
@@ -62,25 +62,23 @@ def sortemes(string, key=lambda s: s):
"""
"""
+ if not string:
+ return []
+
words = []
letters = []
digits = []
words = []
letters = []
digits = []
- if not string:
- return words
+ lappend = letters.append
+ dappend = digits.append
string = unicode(string)
categories = map(unicodedata.category, string)
previous = UNKNOWN
string = unicode(string)
categories = map(unicodedata.category, string)
previous = UNKNOWN
+ wappend = words.append
+ join = u"".join
+ i = 0
- def aletters(letters):
- """Add a group of letters to the word list."""
- words.append((INFINITY, stripends(letters)))
- def adigits(digits):
- """Add a group of digits to the word list."""
- words.append((numeric(digits), u''))
-
- # TODO(jfw): This kind of evolved over time, there's probably a much
- # faster / more concise way to express it now.
- for i, (uchar, category) in enumerate(zip(string, categories)):
+ for uchar in string:
+ category = categories[i]
if letters and previous == LETTER and words:
word = stripends(words.pop()[1].strip()) + BREAKER
if letters and previous == LETTER and words:
word = stripends(words.pop()[1].strip()) + BREAKER
@@ -90,57 +88,61 @@ def sortemes(string, key=lambda s: s):
# Split at the first letter following a number or
# non-continuing character.
if category[0] == "L":
# Split at the first letter following a number or
# non-continuing character.
if category[0] == "L":
- letters.append(uchar)
+ lappend(uchar)
if digits:
if digits:
- adigits(u"".join(digits).strip())
- digits = []
+ words.append((numeric(join(digits).strip()), u''))
+ del(digits[:])
previous = NUMBER
# Split at the first number following a non-number or
# non-continuing character.
elif category[0] == "N":
previous = NUMBER
# Split at the first number following a non-number or
# non-continuing character.
elif category[0] == "N":
- digits.append(uchar)
+ dappend(uchar)
if letters:
if unicodedata.category(letters[-1])[0] == "L":
if letters:
if unicodedata.category(letters[-1])[0] == "L":
- letters.append(HBREAKER)
- aletters(u"".join(letters))
- letters = []
+ lappend(HBREAKER)
+ wappend((INFINITY, stripends(join(letters))))
+ del(letters[:])
previous = LETTER
# Only certain punctuation allowed in numbers.
elif digits and uchar not in ALLOWED_IN_NUMBERS:
previous = LETTER
# Only certain punctuation allowed in numbers.
elif digits and uchar not in ALLOWED_IN_NUMBERS:
- adigits(u"".join(digits))
- digits = []
+ words.append((numeric(join(digits)), u''))
+ del(digits[:])
previous = NUMBER
# Split if we find a non-continuing character ("weird" ones).
previous = NUMBER
# Split if we find a non-continuing character ("weird" ones).
- elif letters and category not in CONTINUE_ON:
+ elif category not in CONTINUE_ON:
if letters:
if letters:
- aletters(u"".join(letters).strip() + BREAKER)
- letters = []
+ wappend(
+ (INFINITY,
+ stripends(join(letters).strip() + BREAKER)))
+ del(letters[:])
previous = LETTER
if digits:
previous = LETTER
if digits:
- adigits(u"".join(digits).strip())
- digits = []
+ words.append((numeric(join(digits)), u''))
+ del(digits[:])
previous = NUMBER
# Split if we find two pieces of punctuation in a row, even
# if we should otherwise continue.
previous = NUMBER
# Split if we find two pieces of punctuation in a row, even
# if we should otherwise continue.
- elif i and categories[i-1][0] in "P" and category[0] in "P":
+ elif i and categories[i - 1][0] == category[0] == "P":
if letters:
if letters:
- aletters(u"".join(letters))
- letters = []
+ wappend((INFINITY, stripends(join(letters))))
+ del(letters[:])
previous = LETTER
if digits:
previous = LETTER
if digits:
- adigits(u"".join(digits))
- digits = []
+ words.append((numeric(join(digits)), u''))
+ del(digits[:])
previous = NUMBER
else:
if digits:
previous = NUMBER
else:
if digits:
- digits.append(uchar)
+ dappend(uchar)
elif letters:
elif letters:
- letters.append(uchar)
+ lappend(uchar)
+
+ i += 1
if letters and previous == LETTER and words:
word = stripends(words.pop()[1].strip()) + BREAKER
if letters and previous == LETTER and words:
word = stripends(words.pop()[1].strip()) + BREAKER
@@
-148,11
+150,11
@@
def sortemes(string, key=lambda s: s):
previous = UNKNOWN
if letters:
previous = UNKNOWN
if letters:
- aletters(u"".join(letters))
+ wappend((INFINITY, stripends(join(letters))))
if digits:
if digits:
- adigits(u"".join(digits))
+ words.append((numeric(join(digits)), u''))
- return [(i, key(w) if w else u'') for i, w in words]
+ return [(i, key(w)) for i, w in words]
def numeric(orig, invalid=INFINITY):
"""Parse a number out of a string.
def numeric(orig, invalid=INFINITY):
"""Parse a number out of a string.