16ba8be7328aabdbd1246936f32aafa1a6057c72
1 """String utility functions for collation."""
3 __all__
= ["sortemes", "numeric", "normalize_number", "deroman"]
7 CONTINUE_ON
= frozenset([
8 "Ll", "Lm", "Lo", "Lt", "Lu",
15 UNKNOWN
, LETTER
, NUMBER
= range(3)
17 BREAKER
= u
"\u2028" # Line break character
18 HBREAKER
= u
"\u2029" # Paragraph break character
19 INFINITY
= float('inf')
21 KEEP_IN_NUMBERS
= u
"'.,"
22 ALLOWED_IN_NUMBERS
= KEEP_IN_NUMBERS
+ u
"_"
43 INITIAL_STOPS
= frozenset([u
"a", u
"an", u
"the"])
46 """Strip punctuation and symbols from the ends of a string."""
47 while word
and unicodedata
.category(word
[0])[0] in "PS":
49 while word
and unicodedata
.category(word
[-1])[0] in "PS":
53 def sortemes(string
, key
=lambda s
: s
):
54 """Generate a list of sortemes for the string.
56 A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
57 sort information. This is larger than a word boundary but smaller
58 than a sentence boundary; roughly, a sorteme boundary occurs between
59 letters and numbers, between numbers and numbers if 'too much'
60 punctuation exists in between, and between lines.
62 There is no formal specification for sortemes; the goal of this
63 function is to provide good output for Collator.sortemekey.
73 lappend
= letters
.append
74 dappend
= digits
.append
75 string
= unicode(string
)
76 categories
= map(unicodedata
.category
, string
)
78 wappend
= words
.append
83 category
= categories
[i
]
85 if letters
and previous
== LETTER
and words
:
86 word
= stripends(words
.pop()[1].strip()) + BREAKER
87 letters
.insert(0, word
)
90 # Split at the first letter following a number or
91 # non-continuing character.
92 if category
[0] == "L":
95 words
.append((numeric(join(digits
).strip()), u
''))
99 # Split at the first number following a non-number or
100 # non-continuing character.
101 elif category
[0] == "N":
104 if unicodedata
.category(letters
[-1])[0] == "L":
106 wappend((INFINITY
, stripends(join(letters
))))
110 # Only certain punctuation allowed in numbers.
111 elif digits
and uchar
not in ALLOWED_IN_NUMBERS
:
112 words
.append((numeric(join(digits
)), u
''))
116 # Split if we find a non-continuing character ("weird" ones).
117 elif category
not in CONTINUE_ON
:
121 stripends(join(letters
).strip() + BREAKER
)))
125 words
.append((numeric(join(digits
)), u
''))
129 # Split if we find two pieces of punctuation in a row, even
130 # if we should otherwise continue.
131 elif i
and categories
[i
- 1][0] == category
[0] == "P":
133 wappend((INFINITY
, stripends(join(letters
))))
137 words
.append((numeric(join(digits
)), u
''))
149 if letters
and previous
== LETTER
and words
:
150 word
= stripends(words
.pop()[1].strip()) + BREAKER
151 letters
.insert(0, word
)
155 wappend((INFINITY
, stripends(join(letters
))))
157 words
.append((numeric(join(digits
)), u
''))
159 return [(i
, key(w
)) for i
, w
in words
]
161 def numeric(orig
, invalid
=INFINITY
):
162 """Parse a number out of a string.
164 This function parses a unicode number out of the start of a
165 string. If a number cannot be found at the start, the 'invalid'
166 argument is returned.
173 string
= unicode(orig
)
175 if uchar
.isnumeric():
181 if u
"\u2160" <= char
<= u
"\u2188":
182 return deroman(string
)
185 while string
[:1] == u
"-" or string
[:1] == u
"+":
186 if string
[:1] == u
"-":
190 if not string
[:1].isnumeric():
193 string
= normalize_number(string
)
195 def _numeric(string
):
196 """Interpret a number as base 10."""
199 number
= unicodedata
.numeric(uchar
)
200 if number
>= 1 or number
== 0:
206 whole
, frac
= string
.split(".")
207 whole
= _numeric(whole
)
208 frac
= _numeric(frac
) / (10.0 ** len(frac
))
209 return mult
* (whole
+ frac
)
211 return mult
* _numeric(string
)
213 def normalize_number(string
):
214 """Normalize punctuation in a number.
216 This function attempts to guess which characters in a number
217 represent grouping separators and which represent decimal
218 points. It returns a string that is valid to pass to Python's
219 float() routine (potentially NaN, if nothing like a number is
224 string
= unicode(string
)
225 string
= filter(lambda u
: u
.isnumeric() or u
in KEEP_IN_NUMBERS
, string
)
226 string
= string
.strip(KEEP_IN_NUMBERS
)
228 commas
= string
.count(u
",")
229 stops
= string
.count(u
".")
230 quotes
= string
.count(u
"'")
232 # If anything occurs more than once, it's a separator.
234 string
= string
.replace(u
",", u
"")
237 string
= string
.replace(u
".", u
"")
240 string
= string
.replace(u
"'", u
"")
243 def normalize_two(a
, b
, string
):
244 """One of each - assume the first is grouping, second is point."""
245 a_idx
= string
.rindex(a
)
246 b_idx
= string
.rindex(b
)
248 string
= string
.replace(b
, u
"").replace(a
, u
".")
250 string
= string
.replace(a
, u
"").replace(b
, u
".")
253 if commas
and stops
and quotes
:
254 # If all three, assume the middle is the decimal point.
259 # Not really valid, so do whatever we want...
262 comma_idx
= string
.index(u
",")
263 stops_idx
= string
.index(u
".")
264 quotes_idx
= string
.index(u
"'")
265 if (comma_idx
< stops_idx
< quotes_idx
266 or quotes_idx
< stops_idx
< comma_idx
):
267 string
= string
.replace(u
",", u
"").replace(u
"'", u
"")
268 elif (comma_idx
< quotes_idx
< stops_idx
269 or stops_idx
< quotes_idx
< comma_idx
):
270 string
= string
.replace(
275 string
= string
.replace(
280 elif stops
and quotes
:
281 string
= normalize_two(u
".", u
"'", string
)
283 elif commas
and quotes
:
284 string
= normalize_two(u
",", u
"'", string
)
286 elif commas
and stops
:
287 string
= normalize_two(u
",", u
".", string
)
290 if string
[-4:-3] == u
"," and len(string
) <= 7:
291 # Single comma as a thousands separator.
292 string
= string
.replace(u
",", u
"")
294 # Single comma, not thousands - probably a decimal point.
295 string
= string
.replace(u
",", u
".")
298 # Single quote, probably MM'SS", equivalent to a decimal point.
299 string
= string
.replace(u
"'", u
".")
301 elif stops
and string
[-4:] == ".000":
302 # Single stop, but no decimal - probably grouping.
303 string
= string
.replace(u
".", u
"")
305 return string
or "NaN"
308 """Turn a Roman numeral into an integer."""
309 string
= unicodedata
.normalize('NFKD', unicode(string
)).lower()
312 for char
in reversed(string
):