More tweaks; notably try to insert paragraph breaks rather than a separate Python...
[python-collate.git] / collate / strings.py
1 import unicodedata
2
# Unicode general categories that do not, by themselves, force a sorteme
# break in sortemes(): letters (L*), combining marks (M*), numbers (N*),
# "other" punctuation (Po) and space separators (Zs).
CONTINUE_ON = frozenset((
    "Lu", "Ll", "Lt", "Lm", "Lo",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Po",
    "Zs",
))

# Scanner states: what kind of run sortemes() is currently reading.
UNKNOWN = 0
LETTER = 1
NUMBER = 2

# U+2029 PARAGRAPH SEPARATOR; sortemes() inserts it between letter runs
# it merges and after a letter run that is followed by a number.
BREAKER = u"\u2029"
14
def sortemes(string):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc., is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, and between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    Args:
        string: The value to split. A falsy value yields an empty list;
            anything else is coerced to a text (unicode) string first.

    Returns:
        A list of text strings. Every piece starts and ends on a letter
        or number character of the input. Consecutive letter pieces are
        merged into one entry joined by BREAKER, and a letter entry that
        is directly followed by a number entry gets a trailing BREAKER.
    """

    words = []
    if not string:
        return words
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # the coercion works on both without naming either builtin.
    string = type(u"")(string)

    def emit(piece, piece_mode, preceding_mode):
        """Add one finished piece to ``words``."""
        # If we read two strings separated by weird punctuation,
        # pretend the punctuation isn't there.
        if piece_mode == preceding_mode == LETTER and words:
            words[-1] += BREAKER + piece
            return
        # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"],
        # which sorts after ["foo", "bar"].
        if piece_mode == NUMBER and preceding_mode == LETTER and words:
            words[-1] += BREAKER
        words.append(piece)

    start = None  # index where the piece being read begins
    last = None  # index of that piece's latest letter/number, if any
    mode = UNKNOWN
    previous_mode = UNKNOWN  # mode of the most recently emitted piece
    category = "XX"  # placeholder: the first prev_category is not "P*"

    # TODO(jfw): This kind of evolved over time, there's probably a much
    # faster / more concise way to express it now.
    for i, c in enumerate(string):
        broke = False
        prev_category = category
        this_mode = mode  # mode in effect *before* looking at c
        category = unicodedata.category(c)

        # Split at the first letter following a number or
        # non-continuing character.
        if category[0] == "L":
            if mode != LETTER:
                broke = True
                mode = LETTER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif category[0] == "N":
            if mode != NUMBER:
                broke = True
                mode = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        elif category not in CONTINUE_ON:
            broke = True
            mode = UNKNOWN

        # Only certain punctuation allowed in numbers.
        elif mode == NUMBER and category[0] == "P" and c not in "',._":
            broke = True
            mode = UNKNOWN

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i > 0 and prev_category[0] == "P" and category[0] == "P":
            broke = True
            mode = UNKNOWN

        # A break closes the piece read so far, provided it contained
        # at least one letter or number.
        if broke and start is not None and last is not None:
            emit(string[start:last + 1], this_mode, previous_mode)
            previous_mode = this_mode

        if broke:
            start = i
            last = None
        if category[0] in "LN":
            last = i

    # Flush whatever piece was still open when the input ended.
    if start is not None and last is not None:
        emit(string[start:last + 1], mode, previous_mode)
    return words
104
def numeric(orig, invalid=float('inf')):
    """Interpret ``orig`` as a number, tolerating signs and punctuation.

    Args:
        orig: The value to interpret; coerced to a text (unicode) string.
        invalid: The value reported when no number can be read.

    Returns:
        A ``(value, text)`` tuple:

        * ``(invalid, '')`` when ``orig`` is falsy;
        * ``(invalid, orig)`` when ``orig`` holds no numeric character,
          or when the first character after any leading ``+``/``-``
          signs is not numeric;
        * ``(number, orig)`` otherwise, where ``number`` is negated once
          for every leading ``-``.
    """
    if not orig:
        return (invalid, '')

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # the coercion works on both without naming either builtin.
    text = type(u"")(orig)
    if not any(c.isnumeric() for c in text):
        return (invalid, orig)

    # Consume leading signs; each "-" flips the sign of the result.
    mult = 1
    while text[:1] in (u"-", u"+"):
        if text[:1] == u"-":
            mult = -mult
        text = text[1:]

    if not text[:1].isnumeric():
        return (invalid, orig)

    text = normalize_punc(text)

    # Early out if possible.
    try:
        return (float(text) * mult, orig)
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    def _numeric(digits):
        """Sum the Unicode numeric values of ``digits`` positionally."""
        total = 0
        for c in digits:
            v = unicodedata.numeric(c)
            # Zero and values of at least one shift the running total
            # left a decimal place; fractional values (e.g. u"\u00bd")
            # are simply added on.
            if v >= 1 or v == 0:
                total *= 10
            total += v
        return total

    whole, point, frac = text.partition(u".")
    if not point:
        return (mult * _numeric(text), orig)
    value = _numeric(whole) + _numeric(frac) / (10.0 ** len(frac))
    return (mult * value, orig)
150
def normalize_punc(string):
    """Reduce a numeric string to numeric characters and at most one ".".

    Everything except numeric characters and the marks ``,``, ``.`` and
    ``'`` is dropped, leading/trailing marks are stripped, and the
    remaining marks are then classified as either grouping separators
    (removed) or the decimal point (rewritten as ``.``).

    Args:
        string: The value to normalize; coerced to a text (unicode)
            string.

    Returns:
        The normalized text string.
    """
    marks = u",.'"

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    text = type(u"")(string)
    # Built with join() rather than filter(): on Python 3 filter()
    # returns an iterator, which has no count()/replace().
    text = u"".join(ch for ch in text if ch.isnumeric() or ch in marks)
    # Strip *after* filtering so that marks exposed by dropping trailing
    # junk (e.g. the second stop in "1.5 in. tall") are not counted.
    text = text.strip(marks)

    # If anything occurs more than once, it's a separator.
    for mark in marks:
        if text.count(mark) > 1:
            text = text.replace(mark, u"")

    # Each surviving mark now occurs exactly once; order them by position.
    present = sorted((mark for mark in marks if mark in text), key=text.index)

    if len(present) == 3:
        # If all three, assume the middle is the decimal point.
        #   A,AAA.BB'CC
        #   A.AAA,BB'CC
        #   A,AAA'BB.CC
        #   A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        #   A'AAA.BB,CC
        #   A'AAA,BB.CC
        before, point, after = present
        text = text.replace(before, u"").replace(after, u"")
        text = text.replace(point, u".")

    elif len(present) == 2:
        # One of each - assume the first is grouping, second is point.
        grouping, point = present
        text = text.replace(grouping, u"").replace(point, u".")

    elif present == [u","]:
        if text[-4:-3] == u"," and len(text) <= 7:
            # Single comma as a thousands separator.
            text = text.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            text = text.replace(u",", u".")

    elif present == [u"'"]:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        text = text.replace(u"'", u".")

    elif present == [u"."] and text[-4:] == u".000":
        # Single stop, but no decimal - probably grouping.
        text = text.replace(u".", u"")

    return text