60685c57a3415e763a63d05545340429113ec222
1 """String utility functions for collation."""
3 __all__
= ["sortemes", "numeric", "normalize_number"]
7 CONTINUE_ON
= frozenset([
8 "Ll", "Lm", "Lo", "Lt", "Lu",
15 UNKNOWN
, LETTER
, NUMBER
= range(3)
17 BREAKER
= u
"\u2028" # Line break character
18 HBREAKER
= u
"\u2029" # Paragraph break character
19 INFINITY
= float('inf')
21 KEEP_IN_NUMBERS
= u
"'.,"
22 ALLOWED_IN_NUMBERS
= KEEP_IN_NUMBERS
+ u
"_"
25 """Strip punctuation and symbols from the ends of a string."""
26 while word
and unicodedata
.category(word
[0])[0] in "PS":
28 while word
and unicodedata
.category(word
[-1])[0] in "PS":
32 def sortemes(string
, key
=lambda s
: s
):
33 """Generate a list of sortemes for the string.
35 A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
36 sort information. This is larger than a word boundary but smaller
37 than a sentence boundary; roughly, a sorteme boundary occurs between
38 letters and numbers, between numbers and numbers if 'too much'
39 punctuation exists in between, and between lines.
41 There is no formal specification for sortemes; the goal of this
42 function is to provide good output for Collator.sortemekey.
51 string
= unicode(string
)
52 categories
= map(unicodedata
.category
, string
)
55 def aletters(letters
):
56 """Add a group of letters to the word list."""
57 words
.append((INFINITY
, stripends(letters
)))
59 """Add a group of digits to the word list."""
60 words
.append((numeric(digits
), u
''))
62 # TODO(jfw): This kind of evolved over time; there's probably a much
63 # faster / more concise way to express it now.
64 for i
, (uchar
, category
) in enumerate(zip(string
, categories
)):
66 if letters
and previous
== LETTER
and words
:
67 word
= stripends(words
.pop()[1].strip()) + BREAKER
68 letters
.insert(0, word
)
71 # Split at the first letter following a number or
72 # non-continuing character.
73 if category
[0] == "L":
76 adigits(u
"".join(digits
).strip())
80 # Split at the first number following a non-number or
81 # non-continuing character.
82 elif category
[0] == "N":
85 if unicodedata
.category(letters
[-1])[0] == "L":
86 letters
.append(HBREAKER
)
87 aletters(u
"".join(letters
))
91 # Only certain punctuation allowed in numbers.
92 elif digits
and uchar
not in ALLOWED_IN_NUMBERS
:
93 adigits(u
"".join(digits
))
97 # Split if we find a non-continuing character ("weird" ones).
98 elif letters
and category
not in CONTINUE_ON
:
100 aletters(u
"".join(letters
).strip() + BREAKER
)
104 adigits(u
"".join(digits
).strip())
108 # Split if we find two pieces of punctuation in a row, even
109 # if we should otherwise continue.
110 elif i
and categories
[i
-1][0] in "P" and category
[0] in "P":
112 aletters(u
"".join(letters
))
116 adigits(u
"".join(digits
))
124 letters
.append(uchar
)
126 if letters
and previous
== LETTER
and words
:
127 word
= stripends(words
.pop()[1].strip()) + BREAKER
128 letters
.insert(0, word
)
132 aletters(u
"".join(letters
))
134 adigits(u
"".join(digits
))
136 return [(i
, key(w
) if w
else u
'') for i
, w
in words
]
138 def numeric(orig
, invalid
=INFINITY
):
139 """Parse a number out of a string.
141 This function parses a unicode number out of the start of a
142 string. If a number cannot be found at the start, the 'invalid'
143 argument is returned.
150 string
= unicode(orig
)
152 if uchar
.isnumeric():
158 while string
[:1] == u
"-" or string
[:1] == u
"+":
159 if string
[:1] == u
"-":
163 if not string
[:1].isnumeric():
166 string
= normalize_number(string
)
168 def _numeric(string
):
169 """Interpret a number as base 10."""
172 number
= unicodedata
.numeric(uchar
)
173 if number
>= 1 or number
== 0:
179 whole
, frac
= string
.split(".")
180 whole
= _numeric(whole
)
181 frac
= _numeric(frac
) / (10.0 ** len(frac
))
182 return mult
* (whole
+ frac
)
184 return mult
* _numeric(string
)
186 def normalize_number(string
):
187 """Normalize punctuation in a number.
189 This function attempts to guess which characters in a number
190 represent grouping separators and which represent decimal
191 points. It returns a string that is valid to pass to Python's
192 float() routine (potentially, NaN, if nothing like a number is
197 string
= unicode(string
)
198 string
= filter(lambda u
: u
.isnumeric() or u
in KEEP_IN_NUMBERS
, string
)
199 string
= string
.strip(KEEP_IN_NUMBERS
)
201 commas
= string
.count(u
",")
202 stops
= string
.count(u
".")
203 quotes
= string
.count(u
"'")
205 # If anything occurs more than once, it's a separator.
207 string
= string
.replace(u
",", u
"")
210 string
= string
.replace(u
".", u
"")
213 string
= string
.replace(u
"'", u
"")
216 def normalize_two(a
, b
, string
):
217 """One of each - assume the first is grouping, second is point."""
218 a_idx
= string
.rindex(a
)
219 b_idx
= string
.rindex(b
)
221 string
= string
.replace(b
, u
"").replace(a
, u
".")
223 string
= string
.replace(a
, u
"").replace(b
, u
".")
226 if commas
and stops
and quotes
:
227 # If all three, assume the middle is the decimal point.
232 # Not really valid, so do whatever we want...
235 comma_idx
= string
.index(u
",")
236 stops_idx
= string
.index(u
".")
237 quotes_idx
= string
.index(u
"'")
238 if (comma_idx
< stops_idx
< quotes_idx
239 or quotes_idx
< stops_idx
< comma_idx
):
240 string
= string
.replace(u
",", u
"").replace(u
"'", u
"")
241 elif (comma_idx
< quotes_idx
< stops_idx
242 or stops_idx
< quotes_idx
< comma_idx
):
243 string
= string
.replace(
248 string
= string
.replace(
253 elif stops
and quotes
:
254 string
= normalize_two(u
".", u
"'", string
)
256 elif commas
and quotes
:
257 string
= normalize_two(u
",", u
"'", string
)
259 elif commas
and stops
:
260 string
= normalize_two(u
",", u
".", string
)
263 if string
[-4:-3] == u
"," and len(string
) <= 7:
264 # Single comma as a thousands separator.
265 string
= string
.replace(u
",", u
"")
267 # Single comma, not thousands - probably a decimal point.
268 string
= string
.replace(u
",", u
".")
271 # Single quote, probably MM'SS", equivalent to a decimal point.
272 string
= string
.replace(u
"'", u
".")
274 elif stops
and string
[-4:] == ".000":
275 # Single stop, but no decimal - probably grouping.
276 string
= string
.replace(u
".", u
"")
278 return string
or "NaN"