3 CONTINUE_ON
= frozenset([
4 "Ll", "Lm", "Lo", "Lt", "Lu",
11 UNKNOWN
, LETTER
, NUMBER
= range(3)
16 """Generate a list of sortemes for the string.
18 A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
19 sort information. This is larger than a word boundary but smaller
20 than a sentence boundary; roughly, a sorteme boundary occurs between
21 letters and numbers, between numbers and numbers if 'too much'
22 punctuation exists in between, between lines.
24 There is no formal specification for sortemes; the goal of this
25 function is to provide good output for Collator.sortemekey.
31 string
= unicode(string
)
35 previous_mode
= UNKNOWN
38 # TODO(jfw): This kind of evolved over time, there's probably a much
39 # faster / more concise way to express it now.
40 for i
, c
in enumerate(string
):
42 prev_category
= category
44 category
= unicodedata
.category(c
)
46 # Split at the first letter following a number or
47 # non-continuing character.
48 if category
[0] == "L":
53 # Split at the first number following a non-number or
54 # non-continuing character.
55 elif category
[0] == "N":
60 # Split if we find a non-continuing character ("weird" ones).
61 elif category
not in CONTINUE_ON
:
65 # Only certain punctuation allowed in numbers.
66 elif mode
== NUMBER
and category
[0] == "P" and c
not in "',._":
70 # Split if we find two pieces of punctuation in a row, even
71 # if we should otherwise continue.
72 elif i
> 0 and prev_category
[0] == "P" and category
[0] == "P":
76 if broke
and start
is not None and last
is not None:
77 # If we read two strings separated by weird punctuation,
78 # pretend the punctuation isn't there.
79 if (this_mode
== previous_mode
== LETTER
81 words
[-1] += BREAKER
+ string
[start
:last
+1]
83 # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
84 # Which sorts after ["foo", "bar"].
85 if this_mode
== NUMBER
and previous_mode
== LETTER
and words
:
87 words
.append(string
[start
:last
+1])
88 previous_mode
= this_mode
93 if category
[0] in "LN":
96 if start
is not None and last
is not None:
97 if this_mode
== LETTER
and previous_mode
== LETTER
and words
:
98 words
[-1] += BREAKER
+ string
[start
:last
+1]
100 if this_mode
== NUMBER
and previous_mode
== LETTER
and words
:
102 words
.append(string
[start
:last
+1])
105 def numeric(orig
, invalid
=float('inf')):
109 string
= unicode(orig
)
114 return (invalid
, orig
)
117 while string
[:1] == u
"-" or string
[:1] == u
"+":
118 if string
[:1] == u
"-":
122 if not string
[:1].isnumeric():
123 return (invalid
, orig
)
125 string
= normalize_punc(string
)
127 # Early out if possible.
129 return (float(string
) * mult
, orig
)
133 # Otherwise we need to do this the hard way.
134 def _numeric(string
):
137 v
= unicodedata
.numeric(c
)
144 whole
, frac
= string
.split(".")
145 whole
= _numeric(whole
)
146 frac
= _numeric(frac
) / (10.0 ** len(frac
))
147 return (mult
* (whole
+ frac
), orig
)
149 return (mult
* _numeric(string
), orig
)
151 def normalize_punc(string
):
152 string
= unicode(string
.strip(u
",.'"))
153 string
= filter(lambda u
: u
.isnumeric() or u
in u
",.'", string
)
154 commas
= string
.count(u
",")
155 stops
= string
.count(u
".")
156 quotes
= string
.count(u
"'")
158 # If anything occurs more than once, it's a separator.
160 string
= string
.replace(u
",", u
"")
163 string
= string
.replace(u
".", u
"")
166 string
= string
.replace(u
"'", u
"")
169 def normalize_two(a
, b
, string
):
170 # One of each - assume the first is grouping, second is point.
171 a_idx
= string
.rindex(a
)
172 b_idx
= string
.rindex(b
)
174 string
= string
.replace(b
, u
"").replace(a
, u
".")
176 string
= string
.replace(a
, u
"").replace(b
, u
".")
179 if commas
and stops
and quotes
:
180 # If all three, assume the middle is the decimal point.
185 # Not really valid, so do whatever we want...
188 comma_idx
= string
.index(u
",")
189 stops_idx
= string
.index(u
".")
190 quotes_idx
= string
.index(u
"'")
191 if (comma_idx
< stops_idx
< quotes_idx
192 or quotes_idx
< stops_idx
< comma_idx
):
193 string
= string
.replace(u
",", u
"").replace(u
"'", u
"")
194 elif (comma_idx
< quotes_idx
< stops_idx
195 or stops_idx
< quotes_idx
< comma_idx
):
196 string
= string
.replace(
201 string
= string
.replace(
206 elif stops
and quotes
:
207 string
= normalize_two(u
".", u
"'", string
)
209 elif commas
and quotes
:
210 string
= normalize_two(u
",", u
"'", string
)
212 elif commas
and stops
:
213 string
= normalize_two(u
",", u
".", string
)
216 if string
[-4:-3] == u
"," and len(string
) <= 7:
217 # Single comma as a thousands separator.
218 string
= string
.replace(u
",", u
"")
220 # Single comma, not thousands - probably a decimal point.
221 string
= string
.replace(u
",", u
".")
224 # Single quote, probably MM'SS", equivalent to a decimal point.
225 string
= string
.replace(u
"'", u
".")
227 elif stops
and string
[-4:] == ".000":
228 # Single stop, but no decimal - probably grouping.
229 string
= string
.replace(u
".", u
"")