57172461b91eac0585395483e6e3ba658d7e4fc2
[python-collate.git] / collate / strings.py
1 import unicodedata
2
# Unicode general categories that do not, by themselves, end a run of
# letters in sortemes(): a character whose category is missing from
# this set forces a split.
CONTINUE_ON = frozenset((
    "Ll", "Lm", "Lo", "Lt", "Lu",  # letters
    "Mc", "Me", "Mn",              # combining marks
    "Nd", "Nl", "No",              # numbers
    "Po",                          # "other" punctuation
    "Zs",                          # space separators
))

# Kind of run most recently flushed by sortemes().
UNKNOWN = 0
LETTER = 1
NUMBER = 2

# Paragraph break character (U+2029); sortemes() uses it to glue
# together runs of letters that were split by punctuation.
BREAKER = u"\u2029"
14
def sortemes(string):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, and between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    The input is converted to text first. A falsy input (empty string,
    None) yields an empty list. Every returned sorteme has leading and
    trailing punctuation/symbol characters (Unicode categories P* and
    S*) removed.

    Runs of letters that were separated only by a non-continuing
    character are glued back together with BREAKER between them, so
    u"abc-def" comes back as the single sorteme u"abc" + BREAKER +
    u"def", while u"abc123" comes back as [u"abc", u"123"].
    """
    words = []
    if not string:
        return words

    # Works on both Python 2 (unicode) and Python 3 (str).
    try:
        to_text = unicode
    except NameError:
        to_text = str
    string = to_text(string)

    # A real list: it is indexed below to look at the previous character.
    categories = [unicodedata.category(ch) for ch in string]

    # Invariant: at most one of these two buffers is non-empty at any
    # time, because starting one kind of run always flushes the other.
    letters = []
    digits = []
    previous = UNKNOWN

    def stripends(word):
        # Drop punctuation/symbol characters from both ends of word.
        while word and unicodedata.category(word[0])[0] in "PS":
            word = word[1:]
        while word and unicodedata.category(word[-1])[0] in "PS":
            word = word[:-1]
        return word

    def rejoin(current):
        # Pull the most recently flushed word back out of `words` and
        # prepend it (cleaned up, followed by BREAKER) to the letters
        # collected so far.
        word = stripends(words.pop().strip())
        return list(stripends(word).strip() + BREAKER) + current

    # TODO(jfw): This kind of evolved over time, there's probably a much
    # faster / more concise way to express it now.
    for i, (c, category) in enumerate(zip(string, categories)):

        # A new run of letters directly follows a flushed run of
        # letters (no number in between): merge them into one sorteme.
        if letters and previous == LETTER and words:
            letters = rejoin(letters)
            previous = UNKNOWN

        kind = category[0]

        # Split at the first letter following a number or
        # non-continuing character.
        if kind == "L":
            letters.append(c)
            if digits:
                words.append(u"".join(digits).strip())
                previous = NUMBER
                digits = []

        # Split at the first number following a non-number or
        # non-continuing character.
        elif kind == "N":
            digits.append(c)
            if letters:
                words.append(u"".join(letters))
                previous = LETTER
                letters = []

        # Only certain punctuation allowed in numbers.
        elif digits and c not in u"',._":
            words.append(u"".join(digits))
            previous = NUMBER
            digits = []

        # Split if we find a non-continuing character ("weird" ones).
        # `digits` is necessarily empty here (see the invariant above).
        elif letters and category not in CONTINUE_ON:
            words.append(u"".join(letters).strip() + BREAKER)
            previous = LETTER
            letters = []

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i and categories[i - 1][0] == "P" and kind == "P":
            if letters:
                words.append(u"".join(letters))
                previous = LETTER
                letters = []
            if digits:
                words.append(u"".join(digits))
                previous = NUMBER
                digits = []

        # Anything else extends whichever run is in progress; characters
        # seen before any run has started are dropped.
        elif digits:
            digits.append(c)
        elif letters:
            letters.append(c)

    # Same merge as at the top of the loop, for a run of letters that
    # ends the string.
    if letters and previous == LETTER and words:
        letters = rejoin(letters)

    if letters:
        words.append(u"".join(letters))
    if digits:
        words.append(u"".join(digits))

    return [stripends(word) for word in words]
121
def numeric(orig, invalid=float('inf')):
    """Interpret orig as a number, for use as a sort key.

    Returns a two-element tuple:

    - ``(invalid, '')`` when orig is falsy;
    - ``(invalid, orig)`` when the text contains no numeric character
      at all, or when the first character after any leading ``+``/``-``
      signs is not numeric;
    - ``(value, orig)`` otherwise, where value is the parsed number
      with the sign applied (each leading ``-`` flips the sign).

    Grouping and decimal punctuation is resolved by normalize_punc()
    before parsing. Text that float() cannot parse is summed up
    character by character using unicodedata.numeric().
    """
    if not orig:
        return (invalid, '')

    # Works on both Python 2 (unicode) and Python 3 (str).
    try:
        to_text = unicode
    except NameError:
        to_text = str
    string = to_text(orig)

    if not any(c.isnumeric() for c in string):
        return (invalid, orig)

    # Consume leading signs. Compare against a tuple rather than using
    # a substring test: the empty string is "in" every string.
    mult = 1
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    if not string[:1].isnumeric():
        return (invalid, orig)

    string = normalize_punc(string)

    # Early out if possible.
    try:
        return (float(string) * mult, orig)
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    def _numeric(chars):
        # Characters whose value is zero or at least one act as digits
        # and shift the running total one decimal place; characters
        # with a fractional value are simply added on.
        total = 0
        for c in chars:
            v = unicodedata.numeric(c)
            if v >= 1 or v == 0:
                total *= 10
            total += v
        return total

    # partition() instead of unpacking split("."): a missing decimal
    # point is an ordinary case, not an exception.
    whole, point, frac = string.partition(u".")
    value = _numeric(whole)
    if point:
        value = value + _numeric(frac) / (10.0 ** len(frac))
    return (mult * value, orig)
167
def normalize_punc(string):
    """Rewrite the punctuation in a number so only a decimal point remains.

    The input is converted to text, stripped of leading and trailing
    ``,``, ``.`` and ``'``, and reduced to its numeric characters plus
    those three punctuation marks. The marks are then interpreted
    heuristically:

    - a mark that occurs more than once is a grouping separator and is
      removed;
    - with one each of all three marks, the middle one is the decimal
      point;
    - with one each of two marks, the later one is the decimal point;
    - a lone comma is a thousands separator if it sits four characters
      from the end of a string of at most seven characters, otherwise a
      decimal point;
    - a lone quote is a decimal point;
    - a lone stop followed by exactly ``000`` at the end of the string
      is a grouping separator; any other lone stop is left alone.

    Returns the rewritten text, in which grouping separators are gone
    and the decimal point, if any, is ``.``.
    """
    # Works on both Python 2 (unicode) and Python 3 (str).
    try:
        to_text = unicode
    except NameError:
        to_text = str
    string = to_text(string).strip(u",.'")

    # Build a string explicitly; filter() over text only returns text
    # on Python 2.
    string = u"".join(
        c for c in string if c.isnumeric() or c in u",.'")

    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    def normalize_two(a, b, text):
        # One of each - assume the first is grouping, second is point.
        if text.rindex(a) > text.rindex(b):
            return text.replace(b, u"").replace(a, u".")
        return text.replace(a, u"").replace(b, u".")

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        #   A,AAA.BB'CC
        #   A.AAA,BB'CC
        #   A,AAA'BB.CC
        #   A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        #   A'AAA.BB,CC
        #   A'AAA,BB.CC
        comma_idx = string.index(u",")
        stops_idx = string.index(u".")
        quotes_idx = string.index(u"'")
        if (comma_idx < stops_idx < quotes_idx
                or quotes_idx < stops_idx < comma_idx):
            # The stop is in the middle: it already is the point.
            string = string.replace(u",", u"").replace(u"'", u"")
        elif (comma_idx < quotes_idx < stops_idx
                or stops_idx < quotes_idx < comma_idx):
            # The quote is in the middle. Remove the real stop before
            # turning the quote into one.
            string = string.replace(u",", u"").replace(u".", u"")
            string = string.replace(u"'", u".")
        else:
            # The comma is in the middle.
            string = string.replace(u"'", u"").replace(u".", u"")
            string = string.replace(u",", u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string[-4:] == u".000":
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string