8d6af993cd72d5f0fcb4fc04eb6e1953f72904c9
[python-collate.git] / collate / strings.py
1 """String utility functions for collation."""
2
3 __all__ = ["sortemes", "numeric", "normalize_number"]
4
5 import unicodedata
6
# Unicode general categories that do not, on their own, end a run of
# letters in sortemes(): any other category forces a split.
CONTINUE_ON = frozenset((
    "Ll", "Lm", "Lo", "Lt", "Lu",   # letters
    "Mc", "Me", "Mn",               # marks
    "Nd", "Nl", "No",               # numbers
    "Po",                           # "other" punctuation
    "Zs",                           # space separators
))

# Kind of group that sortemes() most recently flushed to its word list.
UNKNOWN = 0
LETTER = 1
NUMBER = 2

# Paragraph break character; sortemes() uses it to mark a break inside
# a group of letters.
BREAKER = u"\u2029"

# Numeric slot used for letter groups, and the default 'invalid' result
# of numeric().
INFINITY = float("inf")

# Punctuation normalize_number() keeps as candidate grouping/decimal
# separators.
KEEP_IN_NUMBERS = u"'.,"

# Characters that do not terminate a run of digits in sortemes().
ALLOWED_IN_NUMBERS = KEEP_IN_NUMBERS + u"_"
22
def stripends(word):
    """Strip punctuation and symbols from the ends of a string.

    Characters whose Unicode general category starts with "P"
    (punctuation) or "S" (symbol) are removed from both ends of the
    string; characters in the interior are never touched. Empty or
    otherwise false input is returned unchanged.

    The two bounds are located first and the string is sliced once,
    instead of re-slicing (and so copying) the string for every
    character that is stripped.

    """
    if not word:
        return word

    category = unicodedata.category
    start, end = 0, len(word)
    # Advance past leading punctuation/symbols.
    while start < end and category(word[start])[0] in "PS":
        start += 1
    # Back up over trailing punctuation/symbols, never crossing 'start'.
    while end > start and category(word[end - 1])[0] in "PS":
        end -= 1
    return word[start:end]
30
def sortemes(string, key=lambda s: s):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    The result is a list of 2-tuples. A group of letters is stored as
    (INFINITY, key(text)), where text is the group with punctuation
    and symbols stripped from its ends (or (INFINITY, u'') when nothing
    is left). A group of digits is stored as (numeric(digits), u'').
    An empty or otherwise false 'string' gives an empty list.

    """

    words = []
    letters = []
    digits = []
    if not string:
        return words
    string = unicode(string)
    # Indexed further down (categories[i-1]), so this needs map() to
    # give back a list, as it does on Python 2.
    categories = map(unicodedata.category, string)
    # Which kind of group was flushed to 'words' most recently.
    previous = UNKNOWN

    def aletters(letters):
        """Add a group of letters to the word list."""
        words.append((INFINITY, stripends(letters)))
    def adigits(digits):
        """Add a group of digits to the word list."""
        words.append((numeric(digits), u''))

    # TODO(jfw): This kind of evolved over time, there's probably a much
    # faster / more concise way to express it now.
    for i, (uchar, category) in enumerate(zip(string, categories)):

        # The last flush was a letter group and new letters have been
        # collected since: take the newest entry back out of 'words',
        # trim it, and put it in front of the new letters followed by
        # BREAKER so both runs are emitted as a single group.
        if letters and previous == LETTER and words:
            word = stripends(words.pop()[1].strip()) + BREAKER
            letters.insert(0, word)
            previous = UNKNOWN

        # Split at the first letter following a number or
        # non-continuing character.
        if category[0] == "L":
            letters.append(uchar)
            if digits:
                # A letter ends any pending run of digits.
                adigits(u"".join(digits).strip())
                digits = []
                previous = NUMBER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif category[0] == "N":
            digits.append(uchar)
            if letters:
                # A digit ends any pending run of letters.
                aletters(u"".join(letters))
                letters = []
                previous = LETTER

        # Only certain punctuation allowed in numbers.
        elif digits and uchar not in ALLOWED_IN_NUMBERS:
            adigits(u"".join(digits))
            digits = []
            previous = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        elif letters and category not in CONTINUE_ON:
            # 'letters' is already known to be non-empty here; the
            # flushed group gets a trailing BREAKER.
            if letters:
                aletters(u"".join(letters).strip() + BREAKER)
                letters = []
                previous = LETTER
            if digits:
                adigits(u"".join(digits).strip())
                digits = []
                previous = NUMBER

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i and categories[i-1][0] in "P" and category[0] in "P":
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER
            if digits:
                adigits(u"".join(digits))
                digits = []
                previous = NUMBER

        else:
            # Any other character extends whichever run is open; it is
            # dropped when neither run has started yet.
            if digits:
                digits.append(uchar)
            elif letters:
                letters.append(uchar)

    # Same merge as at the top of the loop, for letters collected on
    # the final iteration.
    if letters and previous == LETTER and words:
        word = stripends(words.pop()[1].strip()) + BREAKER
        letters.insert(0, word)
        previous = UNKNOWN

    # Flush whatever is still pending at the end of the string.
    if letters:
        aletters(u"".join(letters))
    if digits:
        adigits(u"".join(digits))

    # Apply 'key' to the non-empty text of each entry only.
    return [(i, key(w) if w else u'') for i, w in words]
134
def numeric(orig, invalid=INFINITY):
    """Read a number from the beginning of a string.

    Any run of leading "+" and "-" signs is consumed first, each "-"
    flipping the sign of the result. What follows has to begin with a
    Unicode numeric character; it is passed through normalize_number()
    and then read as base 10, a single "." separating the whole part
    from the fractional part.

    The 'invalid' argument is returned when the input is empty or no
    number can be found at the start.

    """

    if not orig:
        return invalid

    text = unicode(orig)
    if not any(char.isnumeric() for char in text):
        return invalid

    # Consume the signs; an even number of minus signs cancels out.
    sign = 1
    while text[:1] in (u"-", u"+"):
        if text[:1] == u"-":
            sign = -sign
        text = text[1:]

    if not text[:1].isnumeric():
        return invalid

    text = normalize_number(text)

    def base10(chars):
        """Fold a run of numeric characters into one base 10 value."""
        value = 0
        for char in chars:
            worth = unicodedata.numeric(char)
            # Values of one or more (and zero) shift what came before
            # one place to the left; smaller values are only added on.
            if worth >= 1 or worth == 0:
                value *= 10
            value += worth
        return value

    parts = text.split(".")
    if len(parts) != 2:
        # No single decimal point to split on: read it as a whole.
        return sign * base10(text)

    whole, frac = parts
    return sign * (base10(whole) + base10(frac) / (10.0 ** len(frac)))
182
def normalize_number(string):
    """Normalize punctuation in a number.

    Guess which of the characters in KEEP_IN_NUMBERS act as grouping
    separators and which one acts as the decimal point, then return
    the number with the grouping removed and the decimal point written
    as ".". "NaN" is returned when nothing is left.

    Everything that is neither numeric nor in KEEP_IN_NUMBERS is
    dropped, and separators are stripped from both ends. After that:

    - a separator used more than once is grouping;
    - with three different separators, the middle one is the point;
    - with two different separators, the later one is the point;
    - a lone comma is grouping when it sits fourth from the end of a
      string of at most seven characters, otherwise it is the point;
    - a lone quote is the point (as in MM'SS");
    - a lone stop is grouping when the string ends in ".000".

    """

    text = unicode(string)
    text = u"".join(
        char for char in text
        if char.isnumeric() or char in KEEP_IN_NUMBERS)
    text = text.strip(KEEP_IN_NUMBERS)

    # If anything occurs more than once, it's a separator: drop it.
    # Whatever occurs exactly once is still undecided.
    single = []
    for mark in (u",", u".", u"'"):
        if text.count(mark) > 1:
            text = text.replace(mark, u"")
        elif mark in text:
            single.append(mark)

    def keep_as_point(point):
        """Remove the other undecided marks, then make 'point' a stop."""
        result = text
        for mark in single:
            if mark != point:
                result = result.replace(mark, u"")
        return result.replace(point, u".")

    if len(single) == 3:
        # All three, e.g. A,AAA.BB'CC - the middle one is the point.
        by_position = sorted(single, key=text.index)
        text = keep_as_point(by_position[1])

    elif len(single) == 2:
        # One of each - the first is grouping, the second is the point.
        text = keep_as_point(max(single, key=text.index))

    elif single == [u","]:
        if text[-4:-3] == u"," and len(text) <= 7:
            # Single comma as a thousands separator.
            text = text.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            text = text.replace(u",", u".")

    elif single == [u"'"]:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        text = text.replace(u"'", u".")

    elif single == [u"."] and text[-4:] == ".000":
        # Single stop, but no decimal - probably grouping.
        text = text.replace(u".", u"")

    return text or "NaN"