strings.sortemes: Use a line break to separate letters and non-continuing-characters...
[python-collate.git] / collate / strings.py
1 """String utility functions for collation."""
2
3 __all__ = ["sortemes", "numeric", "normalize_number"]
4
5 import unicodedata
6
# Unicode general categories that do not, on their own, end a run of
# letters in sortemes().
CONTINUE_ON = frozenset((
    "Ll", "Lm", "Lo", "Lt", "Lu",   # letters
    "Mc", "Me", "Mn",               # marks
    "Nd", "Nl", "No",               # numbers
    "Po",                           # "other" punctuation
    "Zs",                           # space separators
))

# Kind of group most recently added to the output while scanning.
UNKNOWN = 0
LETTER = 1
NUMBER = 2

BREAKER = u"\u2028"   # Line break character (LINE SEPARATOR)
HBREAKER = u"\u2029"  # Paragraph break character (PARAGRAPH SEPARATOR)
INFINITY = float('inf')

# Punctuation that is meaningful inside a number (grouping / decimal
# marks), and the slightly larger set that may appear in a run of
# digits without ending it.
KEEP_IN_NUMBERS = u"'.,"
ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
23
def stripends(word):
    """Strip punctuation and symbols from the ends of a string.

    A character is stripped when its Unicode general category starts
    with "P" (punctuation) or "S" (symbol). Any other character,
    whitespace included, stops the stripping on that side. The
    remaining, possibly empty, string is returned.

    """
    if not word:
        return word

    def strippable(uchar):
        """Tell whether a character is punctuation or a symbol."""
        return unicodedata.category(uchar)[0] in "PS"

    # Find the bounds of the part to keep, then slice once.
    start = 0
    end = len(word)
    while start < end and strippable(word[start]):
        start += 1
    while end > start and strippable(word[end - 1]):
        end -= 1
    return word[start:end]
31
def sortemes(string, key=lambda s: s):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs
    between letters and numbers, between numbers and numbers if 'too
    much' punctuation exists in between, and between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    Arguments:
    string -- the text to split; it is converted to the text type
    key -- callable applied to the text of each non-empty letter group

    Returns a list of (number, text) pairs in input order. A group of
    letters yields (INFINITY, key(text)), or (INFINITY, u'') when
    nothing is left of the text after stripping. A group of digits
    yields (numeric(digits), u''). A false input yields [].

    """

    words = []
    if not string:
        return words

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # the conversion does not depend on a Python 2-only builtin.
    string = type(u"")(string)
    # Built as a real list because it is indexed (categories[i - 1])
    # below; a lazy map object would not support that.
    categories = [unicodedata.category(uchar) for uchar in string]

    # Characters of the group currently being collected. At most one of
    # these lists is non-empty at a time: appending to one always
    # flushes the other first.
    letters = []
    digits = []
    # Kind of group flushed most recently; reset to UNKNOWN once a
    # flushed letter group has been merged back into `letters`.
    previous = UNKNOWN

    def aletters(letters):
        """Add a group of letters to the word list."""
        words.append((INFINITY, stripends(letters)))

    def adigits(digits):
        """Add a group of digits to the word list."""
        words.append((numeric(digits), u''))

    def rejoin(pending):
        """Move the last flushed letter group in front of `pending`.

        The two runs of letters end up in one sorteme, separated by a
        line break character.
        """
        word = stripends(words.pop()[1].strip()) + BREAKER
        pending.insert(0, word)

    # TODO(jfw): This kind of evolved over time, there's probably a much
    # faster / more concise way to express it now.
    for i, (uchar, category) in enumerate(zip(string, categories)):

        # Letters resumed after a letter group was flushed with no
        # number in between: continue that group instead of starting a
        # new one.
        if letters and previous == LETTER and words:
            rejoin(letters)
            previous = UNKNOWN

        major = category[0]

        # Split at the first letter following a number or
        # non-continuing character.
        if major == "L":
            letters.append(uchar)
            if digits:
                adigits(u"".join(digits).strip())
                digits = []
                previous = NUMBER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif major == "N":
            digits.append(uchar)
            if letters:
                # A digit directly after a letter: mark the end of the
                # letter group with a paragraph break character.
                if unicodedata.category(letters[-1])[0] == "L":
                    letters.append(HBREAKER)
                aletters(u"".join(letters))
                letters = []
                previous = LETTER

        # Only certain punctuation allowed in numbers.
        elif digits and uchar not in ALLOWED_IN_NUMBERS:
            adigits(u"".join(digits))
            digits = []
            previous = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        # No digits can be pending here, because `letters` is non-empty.
        elif letters and category not in CONTINUE_ON:
            aletters(u"".join(letters).strip() + BREAKER)
            letters = []
            previous = LETTER

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i and categories[i - 1][0] == "P" and major == "P":
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER
            elif digits:
                adigits(u"".join(digits))
                digits = []
                previous = NUMBER

        # Anything else extends the current group; it is dropped when
        # no group has been started yet.
        else:
            if digits:
                digits.append(uchar)
            elif letters:
                letters.append(uchar)

    # The merge check at the top of the loop never sees the letters
    # appended by the final iteration, so repeat it once here.
    if letters and previous == LETTER and words:
        rejoin(letters)

    if letters:
        aletters(u"".join(letters))
    if digits:
        adigits(u"".join(digits))

    return [(weight, key(text) if text else u'') for weight, text in words]
137
def numeric(orig, invalid=INFINITY):
    """Parse a number out of a string.

    This function parses a unicode number out of the start of a
    string. If a number cannot be found at the start, the 'invalid'
    argument is returned.

    Any run of leading "+" and "-" signs is consumed first, each "-"
    flipping the sign of the result. The rest is passed through
    normalize_number() and read as base 10, with an optional fraction
    after a single "." character.

    """

    if not orig:
        return invalid

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # the conversion does not depend on a Python 2-only builtin.
    string = type(u"")(orig)
    if not any(uchar.isnumeric() for uchar in string):
        return invalid

    mult = 1
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    if not string[:1].isnumeric():
        return invalid

    string = normalize_number(string)

    def _numeric(string):
        """Interpret a number as base 10."""
        total = 0
        for uchar in string:
            number = unicodedata.numeric(uchar)
            # Characters with a value strictly between 0 and 1 (such as
            # vulgar fractions) are added without shifting the total.
            if number >= 1 or number == 0:
                total *= 10
            total += number
        return total

    # Split on the decimal point explicitly rather than catching the
    # ValueError of a failed tuple unpacking, which would also hide a
    # ValueError raised by unicodedata.numeric().
    whole, point, frac = string.partition(u".")
    if not point:
        return mult * _numeric(whole)
    return mult * (_numeric(whole) + _numeric(frac) / (10.0 ** len(frac)))
185
def normalize_number(string):
    """Normalize punctuation in a number.

    This function attempts to guess which characters in a number
    represent grouping separators and which represent decimal
    points. Everything except numeric characters and the characters
    in KEEP_IN_NUMBERS is discarded, grouping separators are removed
    and the decimal mark, if any, becomes ".".

    For ordinary decimal digits the result is valid to pass to
    Python's float() routine. The string "NaN" is returned if nothing
    like a number is found.

    """

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # the conversion does not depend on a Python 2-only builtin.
    string = type(u"")(string)
    # Joined explicitly: filter() only returns a string on Python 2.
    string = u"".join(
        uchar for uchar in string
        if uchar.isnumeric() or uchar in KEEP_IN_NUMBERS)
    string = string.strip(KEEP_IN_NUMBERS)

    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    # From here on every remaining symbol occurs exactly once.

    def normalize_two(a, b, string):
        """One of each - assume the first is grouping, second is point."""
        if string.rindex(a) > string.rindex(b):
            grouping, point = b, a
        else:
            grouping, point = a, b
        return string.replace(grouping, u"").replace(point, u".")

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        first, point, last = sorted((u",", u".", u"'"), key=string.index)
        # Remove the outer two before converting the middle one, so a
        # stop that is only a separator is gone before "." is written.
        string = string.replace(first, u"").replace(last, u"")
        string = string.replace(point, u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string[-4:] == ".000":
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string or "NaN"