5badc8c66ae9826dc8335a9c8d00e129610caaff
3 CONTINUE_ON
= frozenset([
4 "Ll", "Lm", "Lo", "Lt", "Lu",
11 UNKNOWN
, LETTER
, NUMBER
= range(3)
13 BREAKER
= u
"\u2029" # Paragraph break character
14 INFINITY
= float('inf')
16 def sortemes(string
, key
=lambda s
: s
):
17 """Generate a list of sortemes for the string.
19 A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
20 sort information. This is larger than a word boundary but smaller
21 than a sentence boundary; roughly, a sorteme boundary occurs between
22 letters and numbers, between numbers and numbers if 'too much'
23 punctuation exists in between, and between lines.
25 There is no formal specification for sortemes; the goal of this
26 function is to provide good output for Collator.sortemekey.
34 string
= unicode(string
)
35 categories
= map(unicodedata
.category
, string
)
39 while word
and unicodedata
.category(word
[0])[0] in "PS":
41 while word
and unicodedata
.category(word
[-1])[0] in "PS":
45 def aletters(letters
):
46 words
.append((INFINITY
, stripends(letters
)))
48 words
.append((numeric(digits
), u
''))
50 # TODO(jfw): This kind of evolved over time, there's probably a much
51 # faster / more concise way to express it now.
52 for i
, (c
, category
) in enumerate(zip(string
, categories
)):
54 if letters
and previous
== LETTER
and words
:
55 word
= stripends(words
.pop()[1].strip()) + BREAKER
56 letters
.insert(0, word
)
59 # Split at the first letter following a number or
60 # non-continuing character.
61 if category
[0] == "L":
64 adigits(u
"".join(digits
).strip())
68 # Split at the first number following a non-number or
69 # non-continuing character.
70 elif category
[0] == "N":
73 aletters(u
"".join(letters
))
77 # Only certain punctuation allowed in numbers.
78 elif digits
and c
not in "',._":
79 adigits(u
"".join(digits
))
83 # Split if we find a non-continuing character ("weird" ones).
84 elif letters
and category
not in CONTINUE_ON
:
86 aletters(u
"".join(letters
).strip() + BREAKER
)
90 adigits(u
"".join(digits
).strip())
94 # Split if we find two pieces of punctuation in a row, even
95 # if we should otherwise continue.
96 elif i
and categories
[i
-1][0] in "P" and category
[0] in "P":
98 aletters(u
"".join(letters
))
102 adigits(u
"".join(digits
))
112 if letters
and previous
== LETTER
and words
:
113 word
= stripends(words
.pop()[1].strip()) + BREAKER
114 letters
.insert(0, word
)
118 aletters(u
"".join(letters
))
120 adigits(u
"".join(digits
))
122 return [(i
, key(w
) if w
else u
'') for i
, w
in words
]
124 def numeric(orig
, invalid
=INFINITY
):
128 string
= unicode(orig
)
136 while string
[:1] == u
"-" or string
[:1] == u
"+":
137 if string
[:1] == u
"-":
141 if not string
[:1].isnumeric():
142 return (invalid
, orig
)
144 string
= normalize_punc(string
)
146 # Otherwise we need to do this the hard way.
147 def _numeric(string
):
150 v
= unicodedata
.numeric(c
)
157 whole
, frac
= string
.split(".")
158 whole
= _numeric(whole
)
159 frac
= _numeric(frac
) / (10.0 ** len(frac
))
160 return mult
* (whole
+ frac
)
162 return mult
* _numeric(string
)
164 def normalize_punc(string
):
165 string
= unicode(string
.strip(u
",.'"))
166 string
= filter(lambda u
: u
.isnumeric() or u
in u
",.'", string
)
167 commas
= string
.count(u
",")
168 stops
= string
.count(u
".")
169 quotes
= string
.count(u
"'")
171 # If anything occurs more than once, it's a separator.
173 string
= string
.replace(u
",", u
"")
176 string
= string
.replace(u
".", u
"")
179 string
= string
.replace(u
"'", u
"")
182 def normalize_two(a
, b
, string
):
183 # One of each - assume the first is grouping, second is point.
184 a_idx
= string
.rindex(a
)
185 b_idx
= string
.rindex(b
)
187 string
= string
.replace(b
, u
"").replace(a
, u
".")
189 string
= string
.replace(a
, u
"").replace(b
, u
".")
192 if commas
and stops
and quotes
:
193 # If all three, assume the middle is the decimal point.
198 # Not really valid, so do whatever we want...
201 comma_idx
= string
.index(u
",")
202 stops_idx
= string
.index(u
".")
203 quotes_idx
= string
.index(u
"'")
204 if (comma_idx
< stops_idx
< quotes_idx
205 or quotes_idx
< stops_idx
< comma_idx
):
206 string
= string
.replace(u
",", u
"").replace(u
"'", u
"")
207 elif (comma_idx
< quotes_idx
< stops_idx
208 or stops_idx
< quotes_idx
< comma_idx
):
209 string
= string
.replace(
214 string
= string
.replace(
219 elif stops
and quotes
:
220 string
= normalize_two(u
".", u
"'", string
)
222 elif commas
and quotes
:
223 string
= normalize_two(u
",", u
"'", string
)
225 elif commas
and stops
:
226 string
= normalize_two(u
",", u
".", string
)
229 if string
[-4:-3] == u
"," and len(string
) <= 7:
230 # Single comma as a thousands separator.
231 string
= string
.replace(u
",", u
"")
233 # Single comma, not thousands - probably a decimal point.
234 string
= string
.replace(u
",", u
".")
237 # Single quote, probably MM'SS", equivalent to a decimal point.
238 string
= string
.replace(u
"'", u
".")
240 elif stops
and string
[-4:] == ".000":
241 # Single stop, but no decimal - probably grouping.
242 string
= string
.replace(u
".", u
"")