Fix typo, remove unneeded check.
[python-collate.git] / collate / strings.py
1 import unicodedata
2
# Unicode general categories that do not, on their own, end the text
# sorteme currently being collected by sortemes(): letters, marks,
# numbers, "other" punctuation and space separators.
CONTINUE_ON = frozenset(
    "Ll Lm Lo Lt Lu "
    "Mc Me Mn "
    "Nd Nl No "
    "Po "
    "Zs".split()
)

# Markers stored in sortemes()'s ``previous`` variable to remember what
# kind of sorteme was emitted last.
UNKNOWN = 0
LETTER = 1
NUMBER = 2

BREAKER = u"\u2029"  # Paragraph break character
INFINITY = float("inf")
15
def sortemes(string, key=lambda s: s):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    Arguments:
    string -- the text to split; it is converted to Unicode text first.
    key -- callable applied to every non-empty text sorteme (identity
           by default).

    Returns a list of 2-tuples. A text sorteme is (INFINITY, key(text)),
    or (INFINITY, u'') if nothing was left after stripping; a numeric
    sorteme is (numeric(digits), u''). An empty/false input yields [].
    """

    if not string:
        return []

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(string)
    # Built as a real list (not a lazy ``map`` object) because the loop
    # below looks back at categories[i - 1].
    categories = [unicodedata.category(c) for c in string]

    words = []      # Sortemes emitted so far, as (number, text) pairs.
    letters = []    # Characters of the text sorteme being collected.
    digits = []     # Characters of the numeric sorteme being collected.
    previous = UNKNOWN

    def stripends(word):
        # Drop punctuation (P*) and symbol (S*) characters from both ends.
        while word and unicodedata.category(word[0])[0] in "PS":
            word = word[1:]
        while word and unicodedata.category(word[-1])[0] in "PS":
            word = word[:-1]
        return word

    def aletters(letters):
        words.append((INFINITY, stripends(letters)))

    def adigits(digits):
        words.append((numeric(digits), u''))

    # TODO(jfw): This kind of evolved over time, there's probably a much
    # faster / more concise way to express it now.
    #
    # Invariant: ``letters`` and ``digits`` are never both non-empty,
    # because adding a character to one always flushes the other first.
    for i, (c, category) in enumerate(zip(string, categories)):

        # The last thing emitted was text and more text is now being
        # collected: take that word back and put it in front of the
        # pending letters, separated by BREAKER, so both end up in a
        # single sorteme.
        if letters and previous == LETTER and words:
            word = stripends(words.pop()[1].strip()) + BREAKER
            letters.insert(0, word)
            previous = UNKNOWN

        major = category[0]

        # Split at the first letter following a number or
        # non-continuing character.
        if major == "L":
            letters.append(c)
            if digits:
                adigits(u"".join(digits).strip())
                digits = []
                previous = NUMBER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif major == "N":
            digits.append(c)
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER

        # Only certain punctuation allowed in numbers.
        elif digits and c not in u"',._":
            adigits(u"".join(digits))
            digits = []
            previous = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        # ``digits`` is necessarily empty here (see the invariant above),
        # so only the pending letters need flushing.
        elif letters and category not in CONTINUE_ON:
            aletters(u"".join(letters).strip() + BREAKER)
            letters = []
            previous = LETTER

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i and categories[i - 1][0] == "P" and major == "P":
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER
            if digits:
                adigits(u"".join(digits))
                digits = []
                previous = NUMBER

        # Anything else just extends whatever is being collected; it is
        # dropped when nothing is pending.
        else:
            if digits:
                digits.append(c)
            elif letters:
                letters.append(c)

    # Same re-attachment as at the top of the loop, for text that was
    # still pending when the input ran out.
    if letters and previous == LETTER and words:
        word = stripends(words.pop()[1].strip()) + BREAKER
        letters.insert(0, word)
        previous = UNKNOWN

    if letters:
        aletters(u"".join(letters))
    if digits:
        adigits(u"".join(digits))

    return [(i, key(w) if w else u'') for i, w in words]
123
def numeric(orig, invalid=INFINITY):
    """Convert text that looks like a number into a numeric value.

    Leading '+' and '-' signs are honoured, grouping/decimal punctuation
    is resolved by normalize_punc(), and each remaining character is
    valued with unicodedata.numeric(), so non-ASCII digits work too.

    Arguments:
    orig -- the text to interpret; it is converted to Unicode text first.
    invalid -- value returned when no number can be read (INFINITY by
               default).

    Returns:
    - ``invalid`` if orig is empty/false or has no numeric character;
    - the tuple ``(invalid, orig)`` if, after the leading signs, the
      text does not start with a numeric character;
    - otherwise the signed value.
    """
    if not orig:
        return invalid

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(orig)
    if not any(c.isnumeric() for c in string):
        return invalid

    # Consume every leading sign; each '-' flips the result's sign.
    mult = 1
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    if not string[:1].isnumeric():
        # NOTE(review): this path returns a tuple while every other path
        # returns a bare value. Kept as-is because callers may rely on
        # it -- confirm whether plain ``invalid`` was intended.
        return (invalid, orig)

    string = normalize_punc(string)

    def _numeric(string):
        # Accumulate character by character. Values that are 0 or >= 1
        # shift the running total one decimal place first; values
        # strictly between 0 and 1 (fraction characters) are just added.
        total = 0
        for c in string:
            v = unicodedata.numeric(c)
            if v >= 1 or v == 0:
                total *= 10
            total += v
        return total

    # normalize_punc() leaves at most one u"." as the decimal point.
    whole, point, frac = string.partition(u".")
    if not point:
        return mult * _numeric(string)
    return mult * (_numeric(whole) + _numeric(frac) / (10.0 ** len(frac)))
163
def normalize_punc(string):
    """Reduce the punctuation in a number-like string to one decimal point.

    Commas, full stops and apostrophes at either end are stripped, and
    every character that is neither numeric nor one of those three marks
    is dropped. The remaining marks are then interpreted heuristically
    as either digit grouping (removed) or the decimal point (rewritten
    as u"."), so the result contains numeric characters and at most one
    u".".

    Arguments:
    string -- the text to normalize; must provide ``strip``.

    Returns the normalized Unicode text.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(string.strip(u",.'"))
    # Joined explicitly rather than via filter(), which yields an
    # iterator instead of a string on Python 3.
    string = u"".join(c for c in string if c.isnumeric() or c in u",.'")
    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    # From here on each of the three marks occurs at most once.

    def normalize_two(a, b, string):
        # One of each - assume the first is grouping, second is point.
        a_idx = string.rindex(a)
        b_idx = string.rindex(b)
        if a_idx > b_idx:
            string = string.replace(b, u"").replace(a, u".")
        else:
            string = string.replace(a, u"").replace(b, u".")
        return string

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        comma_idx = string.index(u",")
        stops_idx = string.index(u".")
        quotes_idx = string.index(u"'")
        if (comma_idx < stops_idx < quotes_idx
                or quotes_idx < stops_idx < comma_idx):
            # The stop is in the middle: it already is the point.
            string = string.replace(u",", u"").replace(u"'", u"")
        elif (comma_idx < quotes_idx < stops_idx
                or stops_idx < quotes_idx < comma_idx):
            # The quote is in the middle: it becomes the point.
            string = string.replace(
                u",", u"").replace(
                u".", u"").replace(
                u"'", u".")
        else:
            # The comma is in the middle: it becomes the point.
            string = string.replace(
                u"'", u"").replace(
                u".", u"").replace(
                u",", u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string[-4:] == u".000":
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string