3 CONTINUE_ON
= frozenset([
4 "Ll", "Lm", "Lo", "Lt", "Lu",
11 UNKNOWN
, LETTER
, NUMBER
= range(3)
13 BREAKER
= u
"\u2029" # Paragraph break character
16 """Generate a list of sortemes for the string.
18 A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
19 sort information. This is larger than a word boundary but smaller
20 than a sentence boundary; roughly, a sorteme boundary occurs between
21 letters and numbers, between numbers and numbers if 'too much'
22 punctuation exists in between, and between lines.
24 There is no formal specification for sortemes; the goal of this
25 function is to provide good output for Collator.sortemekey.
33 string
= unicode(string
)
34 categories
= map(unicodedata
.category
, string
)
39 while word
and unicodedata
.category(word
[0])[0] in "PS":
41 while word
and unicodedata
.category(word
[-1])[0] in "PS":
45 # TODO(jfw): This kind of evolved over time; there's probably a much
46 # faster / more concise way to express it now.
47 for i
, (c
, category
) in enumerate(zip(string
, categories
)):
49 if letters
and previous
== LETTER
and words
:
50 word
= stripends(words
.pop().strip())
51 letters
= list(stripends(word
).strip() + BREAKER
) + letters
54 # Split at the first letter following a number or
55 # non-continuing character.
56 if category
[0] == "L":
59 words
.append(u
"".join(digits
).strip())
63 # Split at the first number following a non-number or
64 # non-continuing character.
65 elif category
[0] == "N":
68 words
.append(u
"".join(letters
))
72 # Only certain punctuation allowed in numbers.
73 elif digits
and c
not in "',._":
74 words
.append(u
"".join(digits
))
78 # Split if we find a non-continuing character ("weird" ones).
79 elif letters
and category
not in CONTINUE_ON
:
81 words
.append(u
"".join(letters
).strip() + BREAKER
)
85 words
.append(u
"".join(digits
).strip() + BREAKER
)
89 # Split if we find two pieces of punctuation in a row, even
90 # if we should otherwise continue.
91 elif i
and categories
[i
-1][0] in "P" and category
[0] in "P":
93 words
.append(u
"".join(letters
))
97 words
.append(u
"".join(digits
))
107 if letters
and previous
== LETTER
and words
:
108 word
= stripends(words
.pop().strip())
109 letters
= list(stripends(word
).strip() + BREAKER
) + letters
113 words
.append(u
"".join(letters
))
116 words
.append(u
"".join(digits
))
119 words
= map(stripends
, words
)
122 def numeric(orig
, invalid
=float('inf')):
126 string
= unicode(orig
)
131 return (invalid
, orig
)
134 while string
[:1] == u
"-" or string
[:1] == u
"+":
135 if string
[:1] == u
"-":
139 if not string
[:1].isnumeric():
140 return (invalid
, orig
)
142 string
= normalize_punc(string
)
144 # Early out if possible.
146 return (float(string
) * mult
, orig
)
150 # Otherwise we need to do this the hard way.
151 def _numeric(string
):
154 v
= unicodedata
.numeric(c
)
161 whole
, frac
= string
.split(".")
162 whole
= _numeric(whole
)
163 frac
= _numeric(frac
) / (10.0 ** len(frac
))
164 return (mult
* (whole
+ frac
), orig
)
166 return (mult
* _numeric(string
), orig
)
168 def normalize_punc(string
):
169 string
= unicode(string
.strip(u
",.'"))
170 string
= filter(lambda u
: u
.isnumeric() or u
in u
",.'", string
)
171 commas
= string
.count(u
",")
172 stops
= string
.count(u
".")
173 quotes
= string
.count(u
"'")
175 # If anything occurs more than once, it's a separator.
177 string
= string
.replace(u
",", u
"")
180 string
= string
.replace(u
".", u
"")
183 string
= string
.replace(u
"'", u
"")
186 def normalize_two(a
, b
, string
):
187 # One of each - assume the first is grouping, second is point.
188 a_idx
= string
.rindex(a
)
189 b_idx
= string
.rindex(b
)
191 string
= string
.replace(b
, u
"").replace(a
, u
".")
193 string
= string
.replace(a
, u
"").replace(b
, u
".")
196 if commas
and stops
and quotes
:
197 # If all three, assume the middle is the decimal point.
202 # Not really valid, so do whatever we want...
205 comma_idx
= string
.index(u
",")
206 stops_idx
= string
.index(u
".")
207 quotes_idx
= string
.index(u
"'")
208 if (comma_idx
< stops_idx
< quotes_idx
209 or quotes_idx
< stops_idx
< comma_idx
):
210 string
= string
.replace(u
",", u
"").replace(u
"'", u
"")
211 elif (comma_idx
< quotes_idx
< stops_idx
212 or stops_idx
< quotes_idx
< comma_idx
):
213 string
= string
.replace(
218 string
= string
.replace(
223 elif stops
and quotes
:
224 string
= normalize_two(u
".", u
"'", string
)
226 elif commas
and quotes
:
227 string
= normalize_two(u
",", u
"'", string
)
229 elif commas
and stops
:
230 string
= normalize_two(u
",", u
".", string
)
233 if string
[-4:-3] == u
"," and len(string
) <= 7:
234 # Single comma as a thousands separator.
235 string
= string
.replace(u
",", u
"")
237 # Single comma, not thousands - probably a decimal point.
238 string
= string
.replace(u
",", u
".")
241 # Single quote, probably MM'SS", equivalent to a decimal point.
242 string
= string
.replace(u
"'", u
".")
244 elif stops
and string
[-4:] == ".000":
245 # Single stop, but no decimal - probably grouping.
246 string
= string
.replace(u
".", u
"")