1dffdac30168c09d9353383ecb902933b3057fd4
[python-collate.git] / collate / strings.py
1 import unicodedata
2
# Unicode general categories that do not, on their own, end a run of
# letters: letters, marks, numbers, "other" punctuation and spaces.
CONTINUE_ON = frozenset((
    "Ll", "Lm", "Lo", "Lt", "Lu",
    "Mc", "Me", "Mn",
    "Nd", "Nl", "No",
    "Po",
    "Zs",
))

# Markers recording which kind of run was flushed most recently.
UNKNOWN = 0
LETTER = 1
NUMBER = 2

# Paragraph break character, used to join text runs that were split apart.
BREAKER = u"\u2029"
INFINITY = float("inf")
15
def sortemes(string, key=lambda s: s):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, and between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    Args:
        string: The value to split. A falsy value yields an empty list;
            anything else is converted to text before being scanned.
        key: Callable applied to every non-empty text run in the result.
            Defaults to the identity function.

    Returns:
        A list of ``(number, text)`` pairs in input order. A run of
        letters yields ``(INFINITY, key(run))`` (or ``u''`` in place of
        ``key(run)`` when the run is empty after stripping); a run of
        digits yields ``(numeric(run), u'')``.
    """
    words = []
    if not string:
        return words

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(string)
    # Must be a real list: the loop below looks back at categories[i - 1],
    # which a lazy Python 3 ``map`` object would not support.
    categories = [unicodedata.category(char) for char in string]
    letters = []
    digits = []
    # Records which kind of run was flushed to ``words`` most recently.
    previous = UNKNOWN

    def stripends(word):
        # Drop leading and trailing punctuation (P*) and symbol (S*)
        # characters; indices avoid re-slicing once per character.
        start, end = 0, len(word)
        while start < end and unicodedata.category(word[start])[0] in "PS":
            start += 1
        while end > start and unicodedata.category(word[end - 1])[0] in "PS":
            end -= 1
        return word[start:end]

    def aletters(run):
        words.append((INFINITY, stripends(run)))

    def adigits(run):
        words.append((numeric(run), u''))

    def rejoin(pending):
        # The last flushed entry was text and more text has started
        # without a number in between: pull that entry back out and
        # prepend it, separated by BREAKER, so both end up in one sorteme.
        pending.insert(0, stripends(words.pop()[1].strip()) + BREAKER)

    # ``letters`` and ``digits`` are never both non-empty: starting one
    # kind of run always flushes the other.
    for i, (c, category) in enumerate(zip(string, categories)):

        if letters and previous == LETTER and words:
            rejoin(letters)
            previous = UNKNOWN

        major = category[0]

        # Split at the first letter following a number or
        # non-continuing character.
        if major == "L":
            letters.append(c)
            if digits:
                adigits(u"".join(digits).strip())
                digits = []
                previous = NUMBER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif major == "N":
            digits.append(c)
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER

        # Only certain punctuation allowed in numbers.
        elif digits and c not in u"',._":
            adigits(u"".join(digits))
            digits = []
            previous = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        # No digits can be pending here because letters are.
        elif letters and category not in CONTINUE_ON:
            aletters(u"".join(letters).strip() + BREAKER)
            letters = []
            previous = LETTER

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i and categories[i - 1][0] == "P" and major == "P":
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER
            if digits:
                adigits(u"".join(digits))
                digits = []
                previous = NUMBER

        # Otherwise the character simply extends whichever run is open;
        # with no open run it is dropped.
        elif digits:
            digits.append(c)
        elif letters:
            letters.append(c)

    # Same merge as at the top of the loop, for text begun by the
    # final character.
    if letters and previous == LETTER and words:
        rejoin(letters)

    if letters:
        aletters(u"".join(letters))
    if digits:
        adigits(u"".join(digits))

    return [(number, key(text) if text else u'') for number, text in words]
124
def numeric(orig, invalid=float('inf')):
    """Interpret a value as a number.

    Leading ``+``/``-`` signs are consumed (each ``-`` flips the sign),
    separators are normalized with normalize_punc(), and the rest is
    converted with float(). When float() rejects the text, each
    character's Unicode numeric value is accumulated instead.

    Args:
        orig: The value to interpret; it is converted to text first.
        invalid: Returned whenever no number can be read.

    Returns:
        The parsed number, or ``invalid`` when ``orig`` is falsy, has no
        numeric character at all, or does not start with a numeric
        character once the leading signs are removed.
    """
    if not orig:
        return invalid

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(orig)
    if not any(c.isnumeric() for c in string):
        return invalid

    mult = 1
    # Compare against a tuple, not a string: u"" is "in" every string,
    # which would never let this loop terminate on empty input.
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    if not string[:1].isnumeric():
        # Return the same sentinel as the other failure paths so callers
        # always receive a scalar they can compare and sort.
        return invalid

    string = normalize_punc(string)

    # Early out if possible.
    try:
        return float(string) * mult
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    def _numeric(chars):
        total = 0
        for c in chars:
            v = unicodedata.numeric(c)
            # Whole values shift the running total one decimal place;
            # fractional values (0 < v < 1) are just added on.
            if v >= 1 or v == 0:
                total *= 10
            total += v
        return total

    try:
        whole, frac = string.split(u".")
        whole_value = _numeric(whole)
        frac_value = _numeric(frac) / (10.0 ** len(frac))
        return mult * (whole_value + frac_value)
    except ValueError:
        # Not exactly one decimal point: treat it all as the whole part.
        return mult * _numeric(string)
170
def normalize_punc(string):
    """Rewrite a number's separators so at most one ``.`` remains.

    Leading and trailing ``,``, ``.`` and ``'`` are stripped, and every
    character that is neither numeric nor one of those three separators
    is dropped. The separators left over are then classified as grouping
    marks (removed) or the decimal point (rewritten as ``.``).

    Args:
        string: Text holding a number, e.g. ``u"1,234.56"``.

    Returns:
        A text string made of numeric characters and at most one ``.``.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(string.strip(u",.'"))
    # Joined explicitly: filter() only hands back a string on Python 2.
    string = u"".join(
        ch for ch in string if ch.isnumeric() or ch in u",.'")
    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    # From here on every separator still present occurs exactly once.

    def normalize_two(a, b, string):
        # One of each - assume the first is grouping, second is point.
        if string.rindex(a) > string.rindex(b):
            grouping, point = b, a
        else:
            grouping, point = a, b
        # Remove the grouping mark first so that a grouping "." is gone
        # before the decimal point is rewritten as ".".
        return string.replace(grouping, u"").replace(point, u".")

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        first, point, last = sorted(u",.'", key=string.index)
        string = string.replace(first, u"").replace(last, u"")
        string = string.replace(point, u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string[-4:] == u".000":
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string