8d6af993cd72d5f0fcb4fc04eb6e1953f72904c9
1 """String utility functions for collation."""
3 __all__
= ["sortemes", "numeric", "normalize_number"]
7 CONTINUE_ON
= frozenset([
8 "Ll", "Lm", "Lo", "Lt", "Lu",
15 UNKNOWN
, LETTER
, NUMBER
= range(3)
17 BREAKER
= u
"\u2029" # Paragraph break character
18 INFINITY
= float('inf')
20 KEEP_IN_NUMBERS
= u
"'.,"
21 ALLOWED_IN_NUMBERS
= KEEP_IN_NUMBERS
+ u
"_"
24 """Strip punctuation and symbols from the ends of a string."""
25 while word
and unicodedata
.category(word
[0])[0] in "PS":
27 while word
and unicodedata
.category(word
[-1])[0] in "PS":
31 def sortemes(string
, key
=lambda s
: s
):
32 """Generate a list of sortemes for the string.
34 A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
35 sort information. This is larger than a word boundary but smaller
36 than a sentence boundary; roughly, a sorteme boundary occurs between
37 letters and numbers, between numbers and numbers if 'too much'
38 punctuation exists in between, and between lines.
40 There is no formal specification for sortemes; the goal of this
41 function is to provide good output for Collator.sortemekey.
50 string
= unicode(string
)
51 categories
= map(unicodedata
.category
, string
)
54 def aletters(letters
):
55 """Add a group of letters to the word list."""
56 words
.append((INFINITY
, stripends(letters
)))
58 """Add a group of digits to the word list."""
59 words
.append((numeric(digits
), u
''))
61 # TODO(jfw): This kind of evolved over time, there's probably a much
62 # faster / more concise way to express it now.
63 for i
, (uchar
, category
) in enumerate(zip(string
, categories
)):
65 if letters
and previous
== LETTER
and words
:
66 word
= stripends(words
.pop()[1].strip()) + BREAKER
67 letters
.insert(0, word
)
70 # Split at the first letter following a number or
71 # non-continuing character.
72 if category
[0] == "L":
75 adigits(u
"".join(digits
).strip())
79 # Split at the first number following a non-number or
80 # non-continuing character.
81 elif category
[0] == "N":
84 aletters(u
"".join(letters
))
88 # Only certain punctuation allowed in numbers.
89 elif digits
and uchar
not in ALLOWED_IN_NUMBERS
:
90 adigits(u
"".join(digits
))
94 # Split if we find a non-continuing character ("weird" ones).
95 elif letters
and category
not in CONTINUE_ON
:
97 aletters(u
"".join(letters
).strip() + BREAKER
)
101 adigits(u
"".join(digits
).strip())
105 # Split if we find two pieces of punctuation in a row, even
106 # if we should otherwise continue.
107 elif i
and categories
[i
-1][0] in "P" and category
[0] in "P":
109 aletters(u
"".join(letters
))
113 adigits(u
"".join(digits
))
121 letters
.append(uchar
)
123 if letters
and previous
== LETTER
and words
:
124 word
= stripends(words
.pop()[1].strip()) + BREAKER
125 letters
.insert(0, word
)
129 aletters(u
"".join(letters
))
131 adigits(u
"".join(digits
))
133 return [(i
, key(w
) if w
else u
'') for i
, w
in words
]
135 def numeric(orig
, invalid
=INFINITY
):
136 """Parse a number out of a string.
138 This function parses a unicode number out of the start of a
139 string. If a number cannot be found at the start, the 'invalid'
140 argument is returned.
147 string
= unicode(orig
)
149 if uchar
.isnumeric():
155 while string
[:1] == u
"-" or string
[:1] == u
"+":
156 if string
[:1] == u
"-":
160 if not string
[:1].isnumeric():
163 string
= normalize_number(string
)
165 def _numeric(string
):
166 """Interpret a number as base 10."""
169 number
= unicodedata
.numeric(uchar
)
170 if number
>= 1 or number
== 0:
176 whole
, frac
= string
.split(".")
177 whole
= _numeric(whole
)
178 frac
= _numeric(frac
) / (10.0 ** len(frac
))
179 return mult
* (whole
+ frac
)
181 return mult
* _numeric(string
)
183 def normalize_number(string
):
184 """Normalize punctuation in a number.
186 This function attempts to guess which characters in a number
187 represent grouping separators and which represent decimal
188 points. It returns a string that is valid to pass to Python's
189 float() routine (potentially, NaN, if nothing like a number is
194 string
= unicode(string
)
195 string
= filter(lambda u
: u
.isnumeric() or u
in KEEP_IN_NUMBERS
, string
)
196 string
= string
.strip(KEEP_IN_NUMBERS
)
198 commas
= string
.count(u
",")
199 stops
= string
.count(u
".")
200 quotes
= string
.count(u
"'")
202 # If anything occurs more than once, it's a separator.
204 string
= string
.replace(u
",", u
"")
207 string
= string
.replace(u
".", u
"")
210 string
= string
.replace(u
"'", u
"")
213 def normalize_two(a
, b
, string
):
214 """One of each - assume the first is grouping, second is point."""
215 a_idx
= string
.rindex(a
)
216 b_idx
= string
.rindex(b
)
218 string
= string
.replace(b
, u
"").replace(a
, u
".")
220 string
= string
.replace(a
, u
"").replace(b
, u
".")
223 if commas
and stops
and quotes
:
224 # If all three, assume the middle is the decimal point.
229 # Not really valid, so do whatever we want...
232 comma_idx
= string
.index(u
",")
233 stops_idx
= string
.index(u
".")
234 quotes_idx
= string
.index(u
"'")
235 if (comma_idx
< stops_idx
< quotes_idx
236 or quotes_idx
< stops_idx
< comma_idx
):
237 string
= string
.replace(u
",", u
"").replace(u
"'", u
"")
238 elif (comma_idx
< quotes_idx
< stops_idx
239 or stops_idx
< quotes_idx
< comma_idx
):
240 string
= string
.replace(
245 string
= string
.replace(
250 elif stops
and quotes
:
251 string
= normalize_two(u
".", u
"'", string
)
253 elif commas
and quotes
:
254 string
= normalize_two(u
",", u
"'", string
)
256 elif commas
and stops
:
257 string
= normalize_two(u
",", u
".", string
)
260 if string
[-4:-3] == u
"," and len(string
) <= 7:
261 # Single comma as a thousands separator.
262 string
= string
.replace(u
",", u
"")
264 # Single comma, not thousands - probably a decimal point.
265 string
= string
.replace(u
",", u
".")
268 # Single quote, probably MM'SS", equivalent to a decimal point.
269 string
= string
.replace(u
"'", u
".")
271 elif stops
and string
[-4:] == ".000":
272 # Single stop, but no decimal - probably grouping.
273 string
= string
.replace(u
".", u
"")
275 return string
or "NaN"