1 """String utility functions for collation."""
3 __all__
= ["sortemes", "numeric", "normalize_number", "deroman"]
7 CONTINUE_ON
= frozenset([
8 "Ll", "Lm", "Lo", "Lt", "Lu",
15 UNKNOWN
, LETTER
, NUMBER
= range(3)
17 BREAKER
= u
"\u2028" # Line break character
18 HBREAKER
= u
"\u2029" # Paragraph break character
19 INFINITY
= float('inf')
21 KEEP_IN_NUMBERS
= u
"'.,"
22 ALLOWED_IN_NUMBERS
= KEEP_IN_NUMBERS
+ u
"_"
44 """Strip punctuation and symbols from the ends of a string."""
45 while word
and unicodedata
.category(word
[0])[0] in "PS":
47 while word
and unicodedata
.category(word
[-1])[0] in "PS":
51 def sortemes(string
, key
=lambda s
: s
):
52 """Generate a list of sortemes for the string.
54 A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
55 sort information. This is larger than a word boundary but smaller
56 than a sentence boundary; roughly, a sorteme boundary occurs between
57 letters and numbers, between numbers and numbers if 'too much'
58 punctuation exists in between, and between lines.
60 There is no formal specification for sortemes; the goal of this
61 function is to provide good output for Collator.sortemekey.
70 string
= unicode(string
)
71 categories
= map(unicodedata
.category
, string
)
74 def aletters(letters
):
75 """Add a group of letters to the word list."""
76 words
.append((INFINITY
, stripends(letters
)))
78 """Add a group of digits to the word list."""
79 words
.append((numeric(digits
), u
''))
81 # TODO(jfw): This kind of evolved over time, there's probably a much
82 # faster / more concise way to express it now.
83 for i
, (uchar
, category
) in enumerate(zip(string
, categories
)):
85 if letters
and previous
== LETTER
and words
:
86 word
= stripends(words
.pop()[1].strip()) + BREAKER
87 letters
.insert(0, word
)
90 # Split at the first letter following a number or
91 # non-continuing character.
92 if category
[0] == "L":
95 adigits(u
"".join(digits
).strip())
99 # Split at the first number following a non-number or
100 # non-continuing character.
101 elif category
[0] == "N":
104 if unicodedata
.category(letters
[-1])[0] == "L":
105 letters
.append(HBREAKER
)
106 aletters(u
"".join(letters
))
110 # Only certain punctuation allowed in numbers.
111 elif digits
and uchar
not in ALLOWED_IN_NUMBERS
:
112 adigits(u
"".join(digits
))
116 # Split if we find a non-continuing character ("weird" ones).
117 elif letters
and category
not in CONTINUE_ON
:
119 aletters(u
"".join(letters
).strip() + BREAKER
)
123 adigits(u
"".join(digits
).strip())
127 # Split if we find two pieces of punctuation in a row, even
128 # if we should otherwise continue.
129 elif i
and categories
[i
-1][0] in "P" and category
[0] in "P":
131 aletters(u
"".join(letters
))
135 adigits(u
"".join(digits
))
143 letters
.append(uchar
)
145 if letters
and previous
== LETTER
and words
:
146 word
= stripends(words
.pop()[1].strip()) + BREAKER
147 letters
.insert(0, word
)
151 aletters(u
"".join(letters
))
153 adigits(u
"".join(digits
))
155 return [(i
, key(w
) if w
else u
'') for i
, w
in words
]
157 def numeric(orig
, invalid
=INFINITY
):
158 """Parse a number out of a string.
160 This function parses a unicode number out of the start of a
161 string. If a number cannot be found at the start, the 'invalid'
162 argument is returned.
169 string
= unicode(orig
)
171 if uchar
.isnumeric():
177 if u
"\u2160" <= char
<= u
"\u2188":
178 return deroman(string
)
181 while string
[:1] == u
"-" or string
[:1] == u
"+":
182 if string
[:1] == u
"-":
186 if not string
[:1].isnumeric():
189 string
= normalize_number(string
)
191 def _numeric(string
):
192 """Interpret a number as base 10."""
195 number
= unicodedata
.numeric(uchar
)
196 if number
>= 1 or number
== 0:
202 whole
, frac
= string
.split(".")
203 whole
= _numeric(whole
)
204 frac
= _numeric(frac
) / (10.0 ** len(frac
))
205 return mult
* (whole
+ frac
)
207 return mult
* _numeric(string
)
209 def normalize_number(string
):
210 """Normalize punctuation in a number.
212 This function attempts to guess which characters in a number
213 represent grouping separators and which represent decimal
214 points. It returns a string that is valid to pass to Python's
215 float() routine (potentially, NaN, if nothing like a number is
220 string
= unicode(string
)
221 string
= filter(lambda u
: u
.isnumeric() or u
in KEEP_IN_NUMBERS
, string
)
222 string
= string
.strip(KEEP_IN_NUMBERS
)
224 commas
= string
.count(u
",")
225 stops
= string
.count(u
".")
226 quotes
= string
.count(u
"'")
228 # If anything occurs more than once, it's a separator.
230 string
= string
.replace(u
",", u
"")
233 string
= string
.replace(u
".", u
"")
236 string
= string
.replace(u
"'", u
"")
239 def normalize_two(a
, b
, string
):
240 """One of each - assume the first is grouping, second is point."""
241 a_idx
= string
.rindex(a
)
242 b_idx
= string
.rindex(b
)
244 string
= string
.replace(b
, u
"").replace(a
, u
".")
246 string
= string
.replace(a
, u
"").replace(b
, u
".")
249 if commas
and stops
and quotes
:
250 # If all three, assume the middle is the decimal point.
255 # Not really valid, so do whatever we want...
258 comma_idx
= string
.index(u
",")
259 stops_idx
= string
.index(u
".")
260 quotes_idx
= string
.index(u
"'")
261 if (comma_idx
< stops_idx
< quotes_idx
262 or quotes_idx
< stops_idx
< comma_idx
):
263 string
= string
.replace(u
",", u
"").replace(u
"'", u
"")
264 elif (comma_idx
< quotes_idx
< stops_idx
265 or stops_idx
< quotes_idx
< comma_idx
):
266 string
= string
.replace(
271 string
= string
.replace(
276 elif stops
and quotes
:
277 string
= normalize_two(u
".", u
"'", string
)
279 elif commas
and quotes
:
280 string
= normalize_two(u
",", u
"'", string
)
282 elif commas
and stops
:
283 string
= normalize_two(u
",", u
".", string
)
286 if string
[-4:-3] == u
"," and len(string
) <= 7:
287 # Single comma as a thousands separator.
288 string
= string
.replace(u
",", u
"")
290 # Single comma, not thousands - probably a decimal point.
291 string
= string
.replace(u
",", u
".")
294 # Single quote, probably MM'SS", equivalent to a decimal point.
295 string
= string
.replace(u
"'", u
".")
297 elif stops
and string
[-4:] == ".000":
298 # Single stop, but no decimal - probably grouping.
299 string
= string
.replace(u
".", u
"")
301 return string
or "NaN"
304 """Turn a Roman numeral into an integer."""
305 string
= unicodedata
.normalize('NFKD', unicode(string
)).lower()
308 for char
in reversed(string
):