16ba8be7328aabdbd1246936f32aafa1a6057c72
[python-collate.git] / collate / strings.py
1 """String utility functions for collation."""
2
3 __all__ = ["sortemes", "numeric", "normalize_number", "deroman"]
4
5 import unicodedata
6
7 CONTINUE_ON = frozenset([
8 "Ll", "Lm", "Lo", "Lt", "Lu",
9 "Mc", "Me", "Mn",
10 "Nd", "Nl", "No",
11 "Po",
12 "Zs",
13 ])
14
15 UNKNOWN, LETTER, NUMBER = range(3)
16
17 BREAKER = u"\u2028" # Line break character
18 HBREAKER = u"\u2029" # Paragraph break character
19 INFINITY = float('inf')
20
21 KEEP_IN_NUMBERS = u"'.,"
22 ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
23
24 ROMAN = {
25 u"i": 1,
26 u"v": 5,
27 u"x": 10,
28 u"l": 50,
29 u"c": 100,
30 u"d": 500,
31 u"m": 1000,
32 u"\u2180": 1000,
33 u"\u2181": 5000,
34 u"\u2182": 10000,
35 u"\u2183": 100,
36 u"\u2184": 100,
37 u"\u2185": 6,
38 u"\u2186": 50,
39 u"\u2187": 50000,
40 u"\u2188": 100000,
41 }
42
43 INITIAL_STOPS = frozenset([u"a", u"an", u"the"])
44
45 def stripends(word):
46 """Strip punctuation and symbols from the ends of a string."""
47 while word and unicodedata.category(word[0])[0] in "PS":
48 word = word[1:]
49 while word and unicodedata.category(word[-1])[0] in "PS":
50 word = word[:-1]
51 return word
52
53 def sortemes(string, key=lambda s: s):
54 """Generate a list of sortemes for the string.
55
56 A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
57 sort information. This is larger than a word boundry but smaller
58 than a sentence boundry; roughly, a sorteme boundry occurs between
59 letters and numbers, between numbers and numbers if 'too much'
60 punctuation exists in between, between lines.
61
62 There is no formal specification for sortemes; the goal of this
63 function is to provide good output for Collator.sortemekey.
64
65 """
66
67 if not string:
68 return []
69
70 words = []
71 letters = []
72 digits = []
73 lappend = letters.append
74 dappend = digits.append
75 string = unicode(string)
76 categories = map(unicodedata.category, string)
77 previous = UNKNOWN
78 wappend = words.append
79 join = u"".join
80 i = 0
81
82 for uchar in string:
83 category = categories[i]
84
85 if letters and previous == LETTER and words:
86 word = stripends(words.pop()[1].strip()) + BREAKER
87 letters.insert(0, word)
88 previous = UNKNOWN
89
90 # Split at the first letter following a number or
91 # non-continuing character.
92 if category[0] == "L":
93 lappend(uchar)
94 if digits:
95 words.append((numeric(join(digits).strip()), u''))
96 del(digits[:])
97 previous = NUMBER
98
99 # Split at the first number following a non-number or
100 # non-continuing character.
101 elif category[0] == "N":
102 dappend(uchar)
103 if letters:
104 if unicodedata.category(letters[-1])[0] == "L":
105 lappend(HBREAKER)
106 wappend((INFINITY, stripends(join(letters))))
107 del(letters[:])
108 previous = LETTER
109
110 # Only certain punctuation allowed in numbers.
111 elif digits and uchar not in ALLOWED_IN_NUMBERS:
112 words.append((numeric(join(digits)), u''))
113 del(digits[:])
114 previous = NUMBER
115
116 # Split if we find a non-continuing character ("weird" ones).
117 elif category not in CONTINUE_ON:
118 if letters:
119 wappend(
120 (INFINITY,
121 stripends(join(letters).strip() + BREAKER)))
122 del(letters[:])
123 previous = LETTER
124 if digits:
125 words.append((numeric(join(digits)), u''))
126 del(digits[:])
127 previous = NUMBER
128
129 # Split if we find two pieces of punctuation in a row, even
130 # if we should otherwise continue.
131 elif i and categories[i - 1][0] == category[0] == "P":
132 if letters:
133 wappend((INFINITY, stripends(join(letters))))
134 del(letters[:])
135 previous = LETTER
136 if digits:
137 words.append((numeric(join(digits)), u''))
138 del(digits[:])
139 previous = NUMBER
140
141 else:
142 if digits:
143 dappend(uchar)
144 elif letters:
145 lappend(uchar)
146
147 i += 1
148
149 if letters and previous == LETTER and words:
150 word = stripends(words.pop()[1].strip()) + BREAKER
151 letters.insert(0, word)
152 previous = UNKNOWN
153
154 if letters:
155 wappend((INFINITY, stripends(join(letters))))
156 if digits:
157 words.append((numeric(join(digits)), u''))
158
159 return [(i, key(w)) for i, w in words]
160
161 def numeric(orig, invalid=INFINITY):
162 """Parse a number out of a string.
163
164 This function parses a unicode number out of the start of a
165 string. If a number cannot be found at the start, the 'invalid'
166 argument is returned.
167
168 """
169
170 if not orig:
171 return invalid
172
173 string = unicode(orig)
174 for uchar in string:
175 if uchar.isnumeric():
176 break
177 else:
178 return invalid
179
180 for char in string:
181 if u"\u2160" <= char <= u"\u2188":
182 return deroman(string)
183
184 mult = 1
185 while string[:1] == u"-" or string[:1] == u"+":
186 if string[:1] == u"-":
187 mult = -mult
188 string = string[1:]
189
190 if not string[:1].isnumeric():
191 return invalid
192
193 string = normalize_number(string)
194
195 def _numeric(string):
196 """Interpreter a number as base 10."""
197 total = 0
198 for uchar in string:
199 number = unicodedata.numeric(uchar)
200 if number >= 1 or number == 0:
201 total *= 10
202 total += number
203 return total
204
205 try:
206 whole, frac = string.split(".")
207 whole = _numeric(whole)
208 frac = _numeric(frac) / (10.0 ** len(frac))
209 return mult * (whole + frac)
210 except ValueError:
211 return mult * _numeric(string)
212
213 def normalize_number(string):
214 """Normalize punctuation in a number.
215
216 This function attempts to guess which characters in a number
217 represent grouping separators and which represent decimal
218 points. It returns a string that is valid to pass to Python's
219 float() routine (potentially, NaN, if nothing like a number is
220 found).
221
222 """
223
224 string = unicode(string)
225 string = filter(lambda u: u.isnumeric() or u in KEEP_IN_NUMBERS, string)
226 string = string.strip(KEEP_IN_NUMBERS)
227
228 commas = string.count(u",")
229 stops = string.count(u".")
230 quotes = string.count(u"'")
231
232 # If anything occurs more than once, it's a separator.
233 if commas > 1:
234 string = string.replace(u",", u"")
235 commas = 0
236 if stops > 1:
237 string = string.replace(u".", u"")
238 stops = 0
239 if quotes > 1:
240 string = string.replace(u"'", u"")
241 quotes = 0
242
243 def normalize_two(a, b, string):
244 """One of each - assume the first is grouping, second is point."""
245 a_idx = string.rindex(a)
246 b_idx = string.rindex(b)
247 if a_idx > b_idx:
248 string = string.replace(b, u"").replace(a, u".")
249 else:
250 string = string.replace(a, u"").replace(b, u".")
251 return string
252
253 if commas and stops and quotes:
254 # If all three, assume the middle is the decimal point.
255 # A,AAA.BB'CC
256 # A.AAA,BB'CC
257 # A,AAA'BB.CC
258 # A.AAA'BB,CC
259 # Not really valid, so do whatever we want...
260 # A'AAA.BB,CC
261 # A'AAA,BB.CC
262 comma_idx = string.index(u",")
263 stops_idx = string.index(u".")
264 quotes_idx = string.index(u"'")
265 if (comma_idx < stops_idx < quotes_idx
266 or quotes_idx < stops_idx < comma_idx):
267 string = string.replace(u",", u"").replace(u"'", u"")
268 elif (comma_idx < quotes_idx < stops_idx
269 or stops_idx < quotes_idx < comma_idx):
270 string = string.replace(
271 u",", u"").replace(
272 u".", u"").replace(
273 u"'", u".")
274 else:
275 string = string.replace(
276 u"'", u"").replace(
277 u".", u"").replace(
278 u",", u".")
279
280 elif stops and quotes:
281 string = normalize_two(u".", u"'", string)
282
283 elif commas and quotes:
284 string = normalize_two(u",", u"'", string)
285
286 elif commas and stops:
287 string = normalize_two(u",", u".", string)
288
289 elif commas:
290 if string[-4:-3] == u"," and len(string) <= 7:
291 # Single comma as a thousands separator.
292 string = string.replace(u",", u"")
293 else:
294 # Single comma, not thousands - probably a decimal point.
295 string = string.replace(u",", u".")
296
297 elif quotes:
298 # Single quote, probably MM'SS", equivalent to a decimal point.
299 string = string.replace(u"'", u".")
300
301 elif stops and string[-4:] == ".000":
302 # Single stop, but no decimal - probably grouping.
303 string = string.replace(u".", u"")
304
305 return string or "NaN"
306
307 def deroman(string):
308 """Turn a Roman numeral into an integer."""
309 string = unicodedata.normalize('NFKD', unicode(string)).lower()
310 previous = 0
311 building = 0
312 for char in reversed(string):
313 try:
314 value = ROMAN[char]
315 except KeyError:
316 continue
317 if value < previous:
318 building -= value
319 else:
320 building += value
321 previous = value
322 return building