# 487257ffe6e8a02dbdde8b9be669a25d1a976ba5
# [python-collate.git] / collate / strings.py
"""String utility functions for collation."""

# Public API of this module; stripends() is deliberately not exported.
__all__ = ["sortemes", "numeric", "normalize_number", "deroman"]

import unicodedata
6
# Unicode general categories that do not, on their own, force sortemes()
# to end the sorteme it is currently collecting.
CONTINUE_ON = frozenset([
    "Ll", "Lm", "Lo", "Lt", "Lu",
    "Mc", "Me", "Mn",
    "Nd", "Nl", "No",
    "Po",
    "Zs",
])

# Kind of sorteme most recently emitted by sortemes().
UNKNOWN, LETTER, NUMBER = range(3)

BREAKER = u"\u2028"   # Line break character
HBREAKER = u"\u2029"  # Paragraph break character
INFINITY = float('inf')

# Punctuation that normalize_number() keeps and interprets as a grouping
# separator or decimal point.
KEEP_IN_NUMBERS = u"'.,"
# Punctuation that sortemes() tolerates inside a run of digits.
ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"

# Value of each Roman numeral symbol, keyed by its lowercased form.
ROMAN = {
    u"i": 1,
    u"v": 5,
    u"x": 10,
    u"l": 50,
    u"c": 100,
    u"d": 500,
    u"m": 1000,
    u"\u2180": 1000,    # ROMAN NUMERAL ONE THOUSAND C D
    u"\u2181": 5000,    # ROMAN NUMERAL FIVE THOUSAND
    u"\u2182": 10000,   # ROMAN NUMERAL TEN THOUSAND
    u"\u2183": 100,     # ROMAN NUMERAL REVERSED ONE HUNDRED
    u"\u2184": 100,     # LATIN SMALL LETTER REVERSED C
    u"\u2185": 6,       # ROMAN NUMERAL SIX LATE FORM
    u"\u2186": 50,      # ROMAN NUMERAL FIFTY EARLY FORM
    u"\u2187": 50000,   # ROMAN NUMERAL FIFTY THOUSAND
    u"\u2188": 100000,  # ROMAN NUMERAL ONE HUNDRED THOUSAND
}
42
def stripends(word):
    """Strip punctuation and symbols from the ends of a string.

    Characters whose Unicode general category starts with "P"
    (punctuation) or "S" (symbol) are removed from both ends; the
    interior of the string is left untouched. A false value (such as an
    empty string) is returned unchanged.

    """
    if not word:
        return word

    # Walk inwards from both ends and slice once, instead of building a
    # new string for every stripped character.
    start = 0
    end = len(word)
    while start < end and unicodedata.category(word[start])[0] in "PS":
        start += 1
    while end > start and unicodedata.category(word[end - 1])[0] in "PS":
        end -= 1
    return word[start:end]
50
def sortemes(string, key=lambda s: s):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, and between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    'key' is applied to the text part of every sorteme. The result is a
    list of (number, key(text)) pairs in order of appearance: textual
    sortemes are paired with INFINITY, numeric sortemes carry the value
    returned by numeric() together with an empty text. A false 'string'
    yields an empty list.

    """

    if not string:
        return []

    # NOTE(review): relies on the Python 2 'unicode' builtin.
    string = unicode(string)
    categories = [unicodedata.category(uchar) for uchar in string]
    join = u"".join

    words = []      # finished (number, text) pairs
    letters = []    # characters of the textual sorteme being collected
    digits = []     # characters of the numeric sorteme being collected
    previous = UNKNOWN

    for i, uchar in enumerate(string):
        category = categories[i]

        # A new word started right after a finished textual sorteme:
        # pull that sorteme back and glue the two together with BREAKER
        # so consecutive words form a single sorteme.
        if letters and previous == LETTER and words:
            word = stripends(words.pop()[1].strip()) + BREAKER
            letters.insert(0, word)
            previous = UNKNOWN

        # Split at the first letter following a number or
        # non-continuing character.
        if category[0] == "L":
            letters.append(uchar)
            if digits:
                words.append((numeric(join(digits).strip()), u''))
                del digits[:]
                previous = NUMBER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif category[0] == "N":
            digits.append(uchar)
            if letters:
                # A letter directly followed by a digit is marked with
                # HBREAKER before the text is emitted.
                if unicodedata.category(letters[-1])[0] == "L":
                    letters.append(HBREAKER)
                words.append((INFINITY, stripends(join(letters))))
                del letters[:]
                previous = LETTER

        # Only certain punctuation allowed in numbers.
        elif digits and uchar not in ALLOWED_IN_NUMBERS:
            words.append((numeric(join(digits)), u''))
            del digits[:]
            previous = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        # NOTE(review): u"_" is in ALLOWED_IN_NUMBERS but its category
        # (Pc) is not in CONTINUE_ON, so it still ends a number here.
        elif category not in CONTINUE_ON:
            if letters:
                # NOTE(review): BREAKER is appended before stripends()
                # runs, so trailing punctuation is not stripped here,
                # unlike in the merge step above - confirm intent.
                words.append(
                    (INFINITY,
                     stripends(join(letters).strip() + BREAKER)))
                del letters[:]
                previous = LETTER
            if digits:
                words.append((numeric(join(digits)), u''))
                del digits[:]
                previous = NUMBER

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i and categories[i - 1][0] == category[0] == "P":
            if letters:
                words.append((INFINITY, stripends(join(letters))))
                del letters[:]
                previous = LETTER
            if digits:
                words.append((numeric(join(digits)), u''))
                del digits[:]
                previous = NUMBER

        # A continuing character extends whatever is being collected;
        # it is dropped when nothing is in progress.
        else:
            if digits:
                digits.append(uchar)
            elif letters:
                letters.append(uchar)

    # Same merge as at the top of the loop, for a word that started on
    # the very last character.
    if letters and previous == LETTER and words:
        word = stripends(words.pop()[1].strip()) + BREAKER
        letters.insert(0, word)
        previous = UNKNOWN

    # Flush whatever is still being collected.
    if letters:
        words.append((INFINITY, stripends(join(letters))))
    if digits:
        words.append((numeric(join(digits)), u''))

    return [(number, key(text)) for number, text in words]
158
def numeric(orig, invalid=INFINITY):
    """Parse a number out of a string.

    This function parses a unicode number out of the start of a
    string. If a number cannot be found at the start, the 'invalid'
    argument is returned.

    'invalid' is also returned for a false 'orig' and for text without
    any numeric character. Leading "+" and "-" signs are honoured. If
    the text contains any character in the range U+2160..U+2188 it is
    handed to deroman() as a whole, and the signs are not applied.

    """

    if not orig:
        return invalid

    # NOTE(review): relies on the Python 2 'unicode' builtin.
    string = unicode(orig)

    if not any(uchar.isnumeric() for uchar in string):
        return invalid

    # Roman numeral code points take over the whole parse.
    if any(u"\u2160" <= uchar <= u"\u2188" for uchar in string):
        return deroman(string)

    # Consume any run of leading signs; each "-" flips the result.
    mult = 1
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    if not string[:1].isnumeric():
        return invalid

    string = normalize_number(string)

    def _numeric(string):
        """Interpret a number as base 10."""
        total = 0
        for uchar in string:
            number = unicodedata.numeric(uchar)
            # Fractional characters (0 < value < 1) are added without
            # shifting the digits collected so far.
            if number >= 1 or number == 0:
                total *= 10
            total += number
        return total

    whole, point, frac = string.partition(u".")
    if point:
        return mult * (_numeric(whole) + _numeric(frac) / (10.0 ** len(frac)))
    return mult * _numeric(string)
210
def normalize_number(string):
    """Normalize punctuation in a number.

    This function attempts to guess which characters in a number
    represent grouping separators and which represent decimal
    points. Everything except numeric characters and the marks in
    KEEP_IN_NUMBERS is dropped, grouping separators are removed and the
    decimal point, if any, becomes ".". The result is meant to be
    passed to Python's float() routine; if nothing like a number is
    found, "NaN" is returned.

    """

    # NOTE(review): relies on the Python 2 'unicode' builtin.
    string = unicode(string)
    string = u"".join(
        uchar for uchar in string
        if uchar.isnumeric() or uchar in KEEP_IN_NUMBERS)
    string = string.strip(KEEP_IN_NUMBERS)

    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    # From here on every remaining mark occurs exactly once.

    def normalize_two(a, b, string):
        """One of each - assume the first is grouping, second is point."""
        if string.rindex(a) > string.rindex(b):
            grouping, point = b, a
        else:
            grouping, point = a, b
        # Remove the grouping mark first so a "." used for grouping is
        # gone before the point is rewritten to ".".
        return string.replace(grouping, u"").replace(point, u".")

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        first, point, last = sorted(u",.'", key=string.index)
        string = string.replace(
            first, u"").replace(
            last, u"").replace(
            point, u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string[-4:] == u".000":
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string or u"NaN"
304
def deroman(string):
    """Turn a Roman numeral into an integer.

    The text is NFKD-normalized and lowercased first, so the
    compatibility Roman numeral code points are folded into the plain
    letters found in ROMAN. Characters without an entry in ROMAN are
    ignored; text without any known symbol yields 0.

    """
    # NOTE(review): relies on the Python 2 'unicode' builtin.
    string = unicodedata.normalize('NFKD', unicode(string)).lower()
    previous = 0
    building = 0
    # Scan right to left: a symbol smaller than the one to its right is
    # subtracted (the "IV" rule), anything else is added.
    for char in reversed(string):
        value = ROMAN.get(char)
        if value is None:
            continue
        if value < previous:
            building -= value
        else:
            building += value
        previous = value
    return building
319 previous = value
320 return building