fd6f71d852fe4c86f9260eb9a5d83b073dbe7e8d
[python-collate.git] / collate / strings.py
1 """String utility functions for collation."""
2
# Public API of this module.  stripends() and deroman() are internal
# helpers (used by sortemes() and numeric() respectively) and are
# deliberately left out of ``from ... import *``.
__all__ = ["sortemes", "numeric", "normalize_number"]
4
5 import unicodedata
6
# Unicode general categories that do not, by themselves, end the group
# of letters sortemes() is currently collecting.  Any other category
# seen while letters are pending forces a split.
CONTINUE_ON = frozenset((
    "Ll Lm Lo Lt Lu "   # letters
    "Mc Me Mn "         # combining marks
    "Nd Nl No "         # numbers
    "Po "               # "other" punctuation
    "Zs"                # space separators
).split())

# Values taken by the ``previous`` state variable inside sortemes().
UNKNOWN, LETTER, NUMBER = 0, 1, 2

# Separators that sortemes() inserts into the text it emits.
BREAKER = u"\u2028"     # U+2028 LINE SEPARATOR
HBREAKER = u"\u2029"    # U+2029 PARAGRAPH SEPARATOR

# Numeric slot used for text-only sortemes, and the default "invalid"
# result of numeric().
INFINITY = float("inf")

# Punctuation that normalize_number() keeps while deciding which mark
# is the decimal point; sortemes() additionally tolerates "_" inside a
# run of digits.
KEEP_IN_NUMBERS = u"'.,"
ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"

# Value of each Roman numeral symbol understood by deroman().
ROMAN = {
    # Plain letters; deroman() lower-cases its input before the lookup.
    u"i": 1,
    u"v": 5,
    u"x": 10,
    u"l": 50,
    u"c": 100,
    u"d": 500,
    u"m": 1000,
    # Symbols from the Unicode Number Forms block that have no plain
    # letter equivalent.
    u"\u2180": 1000,    # ROMAN NUMERAL ONE THOUSAND C D
    u"\u2181": 5000,    # ROMAN NUMERAL FIVE THOUSAND
    u"\u2182": 10000,   # ROMAN NUMERAL TEN THOUSAND
    u"\u2183": 100,     # ROMAN NUMERAL REVERSED ONE HUNDRED
    u"\u2184": 100,     # LATIN SMALL LETTER REVERSED C
    u"\u2185": 6,       # ROMAN NUMERAL SIX LATE FORM
    u"\u2186": 50,      # ROMAN NUMERAL FIFTY EARLY FORM
    u"\u2187": 50000,   # ROMAN NUMERAL FIFTY THOUSAND
    u"\u2188": 100000,  # ROMAN NUMERAL ONE HUNDRED THOUSAND
}
42
def stripends(word):
    """Return *word* without leading/trailing punctuation and symbols.

    Characters are removed from each end for as long as their Unicode
    general category starts with "P" (punctuation) or "S" (symbol).
    Anything in the middle of the string is left alone.  A falsy
    *word* is returned unchanged.

    """
    if not word:
        return word

    def strippable(uchar):
        return unicodedata.category(uchar)[0] in "PS"

    # Walk two cursors inwards and slice once at the end.
    first, last = 0, len(word)
    while first < last and strippable(word[first]):
        first += 1
    while last > first and strippable(word[last - 1]):
        last -= 1
    return word[first:last]
50
def sortemes(string, key=lambda s: s):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs
    between letters and numbers, between numbers and numbers if 'too
    much' punctuation exists in between, and between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    Arguments:
    string -- the text to split; converted to a text string first
    key -- callable applied to every non-empty text group

    Returns a list of (number, text) pairs. A group of letters becomes
    (INFINITY, key(text)), or (INFINITY, u'') if nothing is left after
    stripping; a group of digits becomes (numeric(digits), u''). An
    empty list is returned for a falsy string.

    """

    words = []
    letters = []
    digits = []
    if not string:
        return words
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # the conversion works on both.
    string = type(u"")(string)
    # A real list rather than map(): the loop below indexes into it,
    # which a Python 3 map object does not support.
    categories = [unicodedata.category(uchar) for uchar in string]
    previous = UNKNOWN

    def aletters(letters):
        """Add a group of letters to the word list."""
        words.append((INFINITY, stripends(letters)))

    def adigits(digits):
        """Add a group of digits to the word list."""
        words.append((numeric(digits), u''))

    # TODO(jfw): This kind of evolved over time, there's probably a much
    # faster / more concise way to express it now.
    for i, (uchar, category) in enumerate(zip(string, categories)):

        # New letters started right after a letter group was flushed,
        # with no number flushed in between: take that group back and
        # continue it, separated by BREAKER.
        if letters and previous == LETTER and words:
            word = stripends(words.pop()[1].strip()) + BREAKER
            letters.insert(0, word)
            previous = UNKNOWN

        # Split at the first letter following a number or
        # non-continuing character.
        if category[0] == "L":
            letters.append(uchar)
            if digits:
                adigits(u"".join(digits).strip())
                digits = []
                previous = NUMBER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif category[0] == "N":
            digits.append(uchar)
            if letters:
                # A letter directly followed by a digit gets an explicit
                # HBREAKER appended before the group is emitted.
                if unicodedata.category(letters[-1])[0] == "L":
                    letters.append(HBREAKER)
                aletters(u"".join(letters))
                letters = []
                previous = LETTER

        # Only certain punctuation allowed in numbers.
        elif digits and uchar not in ALLOWED_IN_NUMBERS:
            adigits(u"".join(digits))
            digits = []
            previous = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        # Letters and digits are never pending at the same time, so
        # only the letters need flushing here.
        elif letters and category not in CONTINUE_ON:
            aletters(u"".join(letters).strip() + BREAKER)
            letters = []
            previous = LETTER

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i and categories[i - 1][0] == "P" and category[0] == "P":
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER
            if digits:
                adigits(u"".join(digits))
                digits = []
                previous = NUMBER

        # Anything else just extends whichever group is being built;
        # it is dropped if no group has been started yet.
        else:
            if digits:
                digits.append(uchar)
            elif letters:
                letters.append(uchar)

    # Same merge as at the top of the loop, for letters that were
    # started by the very last character.
    if letters and previous == LETTER and words:
        word = stripends(words.pop()[1].strip()) + BREAKER
        letters.insert(0, word)

    # Flush whatever is still pending.
    if letters:
        aletters(u"".join(letters))
    if digits:
        adigits(u"".join(digits))

    return [(number, key(word) if word else u'') for number, word in words]
156
def numeric(orig, invalid=INFINITY):
    """Parse a number out of a string.

    This function parses a unicode number out of the start of a
    string. If a number cannot be found at the start, the 'invalid'
    argument is returned.

    Arguments:
    orig -- the value to parse; converted to a text string first
    invalid -- returned when orig is empty, contains no numeric
               character, or does not start with one after any
               leading "+" / "-" signs have been removed

    If the text contains a character in the range U+2160..U+2188
    (Roman numeral symbols) the whole text is handed to deroman() and
    its result is returned; signs are not applied in that case.
    Otherwise separators are resolved with normalize_number() and the
    digits are read as base 10, with each leading "-" flipping the
    sign.

    """

    if not orig:
        return invalid

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(orig)
    if not any(uchar.isnumeric() for uchar in string):
        return invalid

    if any(u"\u2160" <= uchar <= u"\u2188" for uchar in string):
        return deroman(string)

    mult = 1
    # Tuple membership, not substring membership: an empty slice must
    # not count as a sign.
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    if not string[:1].isnumeric():
        return invalid

    string = normalize_number(string)

    def _numeric(string):
        """Interpret a number as base 10."""
        total = 0
        for uchar in string:
            number = unicodedata.numeric(uchar)
            # Whole values shift the running total one decimal place;
            # fraction characters (0 < value < 1) are simply added on.
            if number >= 1 or number == 0:
                total *= 10
            total += number
        return total

    # Only the unpacking is guarded: a ValueError here just means
    # there is no single decimal point to split on.
    try:
        whole, frac = string.split(".")
    except ValueError:
        return mult * _numeric(string)
    return mult * (_numeric(whole) + _numeric(frac) / (10.0 ** len(frac)))
208
def normalize_number(string):
    """Normalize punctuation in a number.

    This function attempts to guess which characters in a number
    represent grouping separators and which represent decimal
    points. Everything except numeric characters and the marks in
    KEEP_IN_NUMBERS is discarded first, and those marks are stripped
    from both ends.

    The result contains the remaining numeric characters and at most
    one "." standing for the decimal point; "NaN" is returned if
    nothing is left. For plain ASCII digits the result is suitable
    for float(), e.g.:

        "1,000"     -> "1000"
        "1,234.56"  -> "1234.56"
        "1.234,56"  -> "1234.56"
        "3,14"      -> "3.14"

    """

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(string)
    # Joined explicitly instead of using filter(), which returns an
    # iterator rather than a string on Python 3.
    string = u"".join(
        uchar for uchar in string
        if uchar.isnumeric() or uchar in KEEP_IN_NUMBERS)
    string = string.strip(KEEP_IN_NUMBERS)

    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    def normalize_two(a, b, string):
        """One of each - assume the first is grouping, second is point."""
        a_idx = string.rindex(a)
        b_idx = string.rindex(b)
        if a_idx > b_idx:
            string = string.replace(b, u"").replace(a, u".")
        else:
            string = string.replace(a, u"").replace(b, u".")
        return string

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        comma_idx = string.index(u",")
        stops_idx = string.index(u".")
        quotes_idx = string.index(u"'")
        if (comma_idx < stops_idx < quotes_idx
                or quotes_idx < stops_idx < comma_idx):
            # The stop is in the middle and is already the right mark.
            string = string.replace(u",", u"").replace(u"'", u"")
        elif (comma_idx < quotes_idx < stops_idx
                or stops_idx < quotes_idx < comma_idx):
            # The quote is in the middle.
            string = string.replace(
                u",", u"").replace(
                u".", u"").replace(
                u"'", u".")
        else:
            # The comma is in the middle.
            string = string.replace(
                u"'", u"").replace(
                u".", u"").replace(
                u",", u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string[-4:] == ".000":
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string or "NaN"
302
def deroman(string):
    """Turn a Roman numeral into an integer.

    The text is NFKD-normalized and lower-cased first, so that
    compatibility characters decompose into the plain letters found
    in ROMAN. Characters that are not keys of ROMAN are ignored.

    Symbols are read from right to left: a symbol worth less than the
    one to its right is subtracted, anything else is added. Returns 0
    if the text contains no Roman numeral symbols at all.

    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    text = unicodedata.normalize('NFKD', type(u"")(string)).lower()
    to_the_right = 0
    total = 0
    for char in reversed(text):
        value = ROMAN.get(char)
        if value is None:
            continue
        if value < to_the_right:
            total -= value
        else:
            total += value
        to_the_right = value
    return total