487257ffe6e8a02dbdde8b9be669a25d1a976ba5
1 """String utility functions for collation."""
3 __all__
= ["sortemes", "numeric", "normalize_number", "deroman"]
7 CONTINUE_ON
= frozenset([
8 "Ll", "Lm", "Lo", "Lt", "Lu",
15 UNKNOWN
, LETTER
, NUMBER
= range(3)
17 BREAKER
= u
"\u2028" # Line break character
18 HBREAKER
= u
"\u2029" # Paragraph break character
19 INFINITY
= float('inf')
21 KEEP_IN_NUMBERS
= u
"'.,"
22 ALLOWED_IN_NUMBERS
= KEEP_IN_NUMBERS
+ u
"_"
44 """Strip punctuation and symbols from the ends of a string."""
45 while word
and unicodedata
.category(word
[0])[0] in "PS":
47 while word
and unicodedata
.category(word
[-1])[0] in "PS":
51 def sortemes(string
, key
=lambda s
: s
):
52 """Generate a list of sortemes for the string.
54 A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
55 sort information. This is larger than a word boundary but smaller
56 than a sentence boundary; roughly, a sorteme boundary occurs between
57 letters and numbers, between numbers and numbers if 'too much'
58 punctuation exists in between, and between lines.
60 There is no formal specification for sortemes; the goal of this
61 function is to provide good output for Collator.sortemekey.
71 lappend
= letters
.append
72 dappend
= digits
.append
73 string
= unicode(string
)
74 categories
= map(unicodedata
.category
, string
)
76 wappend
= words
.append
81 category
= categories
[i
]
83 if letters
and previous
== LETTER
and words
:
84 word
= stripends(words
.pop()[1].strip()) + BREAKER
85 letters
.insert(0, word
)
88 # Split at the first letter following a number or
89 # non-continuing character.
90 if category
[0] == "L":
93 words
.append((numeric(join(digits
).strip()), u
''))
97 # Split at the first number following a non-number or
98 # non-continuing character.
99 elif category
[0] == "N":
102 if unicodedata
.category(letters
[-1])[0] == "L":
104 wappend((INFINITY
, stripends(join(letters
))))
108 # Only certain punctuation allowed in numbers.
109 elif digits
and uchar
not in ALLOWED_IN_NUMBERS
:
110 words
.append((numeric(join(digits
)), u
''))
114 # Split if we find a non-continuing character ("weird" ones).
115 elif category
not in CONTINUE_ON
:
119 stripends(join(letters
).strip() + BREAKER
)))
123 words
.append((numeric(join(digits
)), u
''))
127 # Split if we find two pieces of punctuation in a row, even
128 # if we should otherwise continue.
129 elif i
and categories
[i
- 1][0] == category
[0] == "P":
131 wappend((INFINITY
, stripends(join(letters
))))
135 words
.append((numeric(join(digits
)), u
''))
147 if letters
and previous
== LETTER
and words
:
148 word
= stripends(words
.pop()[1].strip()) + BREAKER
149 letters
.insert(0, word
)
153 wappend((INFINITY
, stripends(join(letters
))))
155 words
.append((numeric(join(digits
)), u
''))
157 return [(i
, key(w
)) for i
, w
in words
]
159 def numeric(orig
, invalid
=INFINITY
):
160 """Parse a number out of a string.
162 This function parses a unicode number out of the start of a
163 string. If a number cannot be found at the start, the 'invalid'
164 argument is returned.
171 string
= unicode(orig
)
173 if uchar
.isnumeric():
179 if u
"\u2160" <= char
<= u
"\u2188":
180 return deroman(string
)
183 while string
[:1] == u
"-" or string
[:1] == u
"+":
184 if string
[:1] == u
"-":
188 if not string
[:1].isnumeric():
191 string
= normalize_number(string
)
193 def _numeric(string
):
194 """Interpret a number as base 10."""
197 number
= unicodedata
.numeric(uchar
)
198 if number
>= 1 or number
== 0:
204 whole
, frac
= string
.split(".")
205 whole
= _numeric(whole
)
206 frac
= _numeric(frac
) / (10.0 ** len(frac
))
207 return mult
* (whole
+ frac
)
209 return mult
* _numeric(string
)
211 def normalize_number(string
):
212 """Normalize punctuation in a number.
214 This function attempts to guess which characters in a number
215 represent grouping separators and which represent decimal
216 points. It returns a string that is valid to pass to Python's
217 float() routine (potentially, NaN, if nothing like a number is
222 string
= unicode(string
)
223 string
= filter(lambda u
: u
.isnumeric() or u
in KEEP_IN_NUMBERS
, string
)
224 string
= string
.strip(KEEP_IN_NUMBERS
)
226 commas
= string
.count(u
",")
227 stops
= string
.count(u
".")
228 quotes
= string
.count(u
"'")
230 # If anything occurs more than once, it's a separator.
232 string
= string
.replace(u
",", u
"")
235 string
= string
.replace(u
".", u
"")
238 string
= string
.replace(u
"'", u
"")
241 def normalize_two(a
, b
, string
):
242 """One of each - assume the first is grouping, second is point."""
243 a_idx
= string
.rindex(a
)
244 b_idx
= string
.rindex(b
)
246 string
= string
.replace(b
, u
"").replace(a
, u
".")
248 string
= string
.replace(a
, u
"").replace(b
, u
".")
251 if commas
and stops
and quotes
:
252 # If all three, assume the middle is the decimal point.
257 # Not really valid, so do whatever we want...
260 comma_idx
= string
.index(u
",")
261 stops_idx
= string
.index(u
".")
262 quotes_idx
= string
.index(u
"'")
263 if (comma_idx
< stops_idx
< quotes_idx
264 or quotes_idx
< stops_idx
< comma_idx
):
265 string
= string
.replace(u
",", u
"").replace(u
"'", u
"")
266 elif (comma_idx
< quotes_idx
< stops_idx
267 or stops_idx
< quotes_idx
< comma_idx
):
268 string
= string
.replace(
273 string
= string
.replace(
278 elif stops
and quotes
:
279 string
= normalize_two(u
".", u
"'", string
)
281 elif commas
and quotes
:
282 string
= normalize_two(u
",", u
"'", string
)
284 elif commas
and stops
:
285 string
= normalize_two(u
",", u
".", string
)
288 if string
[-4:-3] == u
"," and len(string
) <= 7:
289 # Single comma as a thousands separator.
290 string
= string
.replace(u
",", u
"")
292 # Single comma, not thousands - probably a decimal point.
293 string
= string
.replace(u
",", u
".")
296 # Single quote, probably MM'SS", equivalent to a decimal point.
297 string
= string
.replace(u
"'", u
".")
299 elif stops
and string
[-4:] == ".000":
300 # Single stop, but no decimal - probably grouping.
301 string
= string
.replace(u
".", u
"")
303 return string
or "NaN"
306 """Turn a Roman numeral into an integer."""
307 string
= unicodedata
.normalize('NFKD', unicode(string
)).lower()
310 for char
in reversed(string
):