1 """String utility functions for collation."""
3 __all__ = ["sortemes", "numeric", "normalize_number"]
5 import unicodedata
# Unicode general categories that sortemes() treats as "continuing": a
# character whose category is not listed here ends the current letter group.
CONTINUE_ON = frozenset((
    "Ll", "Lm", "Lo", "Lt", "Lu",  # letters
    "Mc", "Me", "Mn",              # marks
    "Nd", "Nl", "No",              # numbers
    "Po",                          # other punctuation
    "Zs",                          # space separators
))

# Values of sortemes()' ``previous`` state: the kind of group flushed last.
UNKNOWN = 0
LETTER = 1
NUMBER = 2

BREAKER = u"\u2029"  # Paragraph break character
INFINITY = float("inf")

# Punctuation that normalize_number() keeps as candidate separators.
KEEP_IN_NUMBERS = u"'.,"
# Punctuation that may appear inside a digit group without ending it.
ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
def stripends(word):
    """Return word without leading or trailing punctuation and symbols.

    A character is removed when its Unicode general category begins
    with "P" (punctuation) or "S" (symbol). Characters in the middle
    of the string are never touched. A false value (such as an empty
    string) is returned unchanged.

    """
    if not word:
        return word

    def strippable(char):
        """Tell whether char is punctuation or a symbol."""
        return unicodedata.category(char)[0] in "PS"

    # Move both ends inwards, then slice once.
    start = 0
    stop = len(word)
    while start < stop and strippable(word[start]):
        start += 1
    while stop > start and strippable(word[stop - 1]):
        stop -= 1
    return word[start:stop]
def sortemes(string, key=lambda s: s):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, and between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    Arguments:
    string -- the text to split; it is coerced to a unicode string
    key -- callable applied to every non-empty group of letters

    Returns a list of 2-tuples, in order of appearance. A group of
    letters becomes (INFINITY, key(group)) and a group of digits
    becomes (numeric(group), u''). A false string yields an empty list.

    """

    words = []
    letters = []
    digits = []
    if not string:
        return words
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(string)
    # Built as a real list because it is indexed (categories[i - 1]) below;
    # a bare map() is a one-shot iterator on Python 3.
    categories = [unicodedata.category(uchar) for uchar in string]
    previous = UNKNOWN

    def aletters(letters):
        """Add a group of letters to the word list."""
        words.append((INFINITY, stripends(letters)))

    def adigits(digits):
        """Add a group of digits to the word list."""
        words.append((numeric(digits), u''))

    # TODO(jfw): This kind of evolved over time, there's probably a much
    # faster / more concise way to express it now.
    for i, (uchar, category) in enumerate(zip(string, categories)):

        # New letters arrived right after a letter group was flushed:
        # take that group back and prepend it, joined by BREAKER, so the
        # two end up in the same sorteme.
        if letters and previous == LETTER and words:
            word = stripends(words.pop()[1].strip()) + BREAKER
            letters.insert(0, word)
            previous = UNKNOWN

        # Split at the first letter following a number or
        # non-continuing character.
        if category[0] == "L":
            letters.append(uchar)
            if digits:
                adigits(u"".join(digits))
                digits = []
                previous = NUMBER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif category[0] == "N":
            digits.append(uchar)
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER

        # Only certain punctuation allowed in numbers.
        elif digits and uchar not in ALLOWED_IN_NUMBERS:
            adigits(u"".join(digits))
            digits = []
            previous = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        # The branch condition already guarantees letters is non-empty.
        elif letters and category not in CONTINUE_ON:
            aletters(u"".join(letters).strip() + BREAKER)
            letters = []
            previous = LETTER
            if digits:
                adigits(u"".join(digits))
                digits = []
                previous = NUMBER

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i and categories[i - 1][0] == "P" and category[0] == "P":
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER
            if digits:
                adigits(u"".join(digits))
                digits = []
                previous = NUMBER

        # Anything else extends whichever group is currently open and
        # is dropped when no group is open.
        else:
            if digits:
                digits.append(uchar)
            elif letters:
                letters.append(uchar)

    # Same re-joining step as at the top of the loop, for the last group.
    if letters and previous == LETTER and words:
        word = stripends(words.pop()[1].strip()) + BREAKER
        letters.insert(0, word)
        previous = UNKNOWN

    # Flush whatever is still open at the end of the string.
    if letters:
        aletters(u"".join(letters))
    if digits:
        adigits(u"".join(digits))

    return [(weight, key(word) if word else u'') for weight, word in words]
def numeric(orig, invalid=INFINITY):
    """Parse a number out of a string.

    This function parses a unicode number out of the start of a
    string. If a number cannot be found at the start, the 'invalid'
    argument is returned.

    Any run of leading "+" and "-" characters is consumed first; each
    "-" flips the sign of the result. The remaining text is passed
    through normalize_number() to settle which punctuation is the
    decimal point before the digits are summed.

    Arguments:
    orig -- the value to parse; it is coerced to a unicode string
    invalid -- returned when orig is false or does not start with a
               number (after any signs)

    Returns the parsed value as a float, or 'invalid'.

    """

    if not orig:
        return invalid

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(orig)

    mult = 1
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    # This also rejects strings that contain no numeric character at
    # all, so no separate scan for one is needed.
    if not string[:1].isnumeric():
        return invalid

    string = normalize_number(string)

    def _numeric(string):
        """Interpret a number as base 10."""
        total = 0
        for uchar in string:
            number = unicodedata.numeric(uchar)
            # Ordinary digits shift the running total one decimal
            # place; other values (e.g. a vulgar fraction such as
            # U+00BD, worth 0.5) are simply added on.
            if number >= 1 or number == 0:
                total *= 10
            total += number
        return total

    parts = string.split(u".")
    if len(parts) == 2:
        whole, frac = parts
        return mult * (_numeric(whole) + _numeric(frac) / (10.0 ** len(frac)))
    return mult * _numeric(string)
def normalize_number(string):
    """Normalize punctuation in a number.

    This function attempts to guess which characters in a number
    represent grouping separators and which represent decimal
    points. It returns a string that is valid to pass to Python's
    float() routine (potentially "NaN", if nothing like a number is
    found).

    Only numeric characters and the separators in KEEP_IN_NUMBERS
    survive; everything else is discarded, as are separators at either
    end. Grouping separators are removed and the separator judged to be
    the decimal point is rewritten as ".".

    """

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(string)
    # Joined explicitly rather than via filter(), which returns an
    # iterator instead of a string on Python 3.
    string = u"".join(
        uchar for uchar in string
        if uchar.isnumeric() or uchar in KEEP_IN_NUMBERS)
    string = string.strip(KEEP_IN_NUMBERS)

    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    # From here on each of the three counts is either 0 or 1.

    def normalize_two(a, b, string):
        """One of each - assume the first is grouping, second is point."""
        if string.rindex(a) > string.rindex(b):
            grouping, point = b, a
        else:
            grouping, point = a, b
        return string.replace(grouping, u"").replace(point, u".")

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        first, point, last = sorted(u",.'", key=string.index)
        # Remove the outer two before rewriting the point, so that a
        # removed "." cannot be confused with the new decimal point.
        string = string.replace(first, u"").replace(last, u"")
        string = string.replace(point, u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string[-4:] == ".000":
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string or "NaN"