dde1739250868090247e6de736848f9b9b594b1d
[python-collate.git] / collate / _strings.py
1 import unicodedata
2
# Unicode general categories that may appear inside a sorteme.  A
# character whose category is not listed here always ends the current
# sorteme (see sortemes() below).
CONTINUE_ON = frozenset([
    "Ll", "Lm", "Lo", "Lt", "Lu",
    "Mc", "Me", "Mn",
    "Nd", "Nl", "No",
    "Po",
    "Zs",
])

# Kind of run the scanner in sortemes() is currently inside.
UNKNOWN, LETTER, NUMBER = range(3)


def _append_sorteme(words, piece, piece_mode, previous_mode, join_letters):
    """Add piece to words, merging with or padding the previous entry.

    piece_mode is the mode of the run that produced piece and
    previous_mode the mode of the run that produced words[-1].
    words is modified in place.
    """
    if join_letters and piece_mode == previous_mode == LETTER and words:
        # Two letter runs separated only by ignorable characters are
        # treated as one sorteme with a single space between them.
        words[-1] += u" " + piece
        return
    # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
    # Which sorts after ["foo", "bar"].
    if piece_mode == NUMBER and previous_mode == LETTER and words:
        words[-1] += u" "
    words.append(piece)


def sortemes(string):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    An empty (falsy) input yields an empty list.  Otherwise the input
    is converted to text and a list of text strings is returned.  Each
    piece starts at a letter or number and ends at the last letter or
    number of its run, so trailing punctuation and spaces are dropped.
    """
    words = []
    if not string:
        return words

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # the conversion works on both without naming either builtin.
    string = type(u"")(string)
    start = None    # index where the current run began
    last = None     # index of the last letter/number seen in the run
    mode = UNKNOWN
    previous_mode = UNKNOWN     # mode of the most recently emitted run
    category = "XX"
    for i, c in enumerate(string):
        broke = False
        prev_category = category
        this_mode = mode    # mode of the run that ends if we break here
        category = unicodedata.category(c)
        major = category[0]

        # Split at the first letter following a number or
        # non-continuing character.
        if major == "L":
            if mode != LETTER:
                broke = True
                mode = LETTER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif major == "N":
            if mode != NUMBER:
                broke = True
                mode = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        elif category not in CONTINUE_ON:
            broke = True
            mode = UNKNOWN

        # Only certain punctuation allowed in numbers.
        elif mode == NUMBER and major == "P" and c not in "',._":
            broke = True
            mode = UNKNOWN

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i > 0 and prev_category[0] == "P" and major == "P":
            broke = True
            mode = UNKNOWN

        if broke and start is not None and last is not None:
            # If we read two strings separated by weird punctuation,
            # pretend the punctuation isn't there.
            _append_sorteme(
                words, string[start:last + 1], this_mode, previous_mode,
                prev_category[0] == "P")
            previous_mode = this_mode

        if broke:
            start = i
            last = None
        if major in "LN":
            last = i

    if start is not None and last is not None:
        # The pending run must be classified by the *current* mode.
        # The pre-character snapshot (this_mode) is stale whenever the
        # final character itself started a new run, e.g. "foo2".
        _append_sorteme(
            words, string[start:last + 1], mode, previous_mode, True)
    return words
99
def numeric(orig, invalid=float('inf')):
    """Interpret orig as a number, returning a (value, orig) pair.

    orig is converted to text.  Any leading run of "+" and "-" signs
    is consumed, with each "-" flipping the sign.  The remainder is
    passed through normalize_punc() and parsed.  Characters that
    float() rejects but unicodedata.numeric() understands are handled
    by a slower per-character fallback.

    Returns:
      (invalid, '') if orig is empty or otherwise falsy.
      (invalid, orig) if orig contains no numeric character, or the
        first character after the signs is not numeric.
      (number, orig) otherwise.
    """
    if not orig:
        return (invalid, '')

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # the conversion works on both without naming either builtin.
    string = type(u"")(orig)
    if not any(c.isnumeric() for c in string):
        return (invalid, orig)

    mult = 1
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    if not string[:1].isnumeric():
        return (invalid, orig)

    string = normalize_punc(string)

    # Early out if possible.
    try:
        return (float(string) * mult, orig)
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    def _numeric(string):
        # Accumulate the numeric value of each character.  Zero and
        # values >= 1 shift the running total one decimal place first;
        # values strictly between 0 and 1 are simply added on.
        # unicodedata.numeric() raises ValueError for characters
        # without a numeric value.
        total = 0
        for c in string:
            v = unicodedata.numeric(c)
            if v >= 1 or v == 0:
                total *= 10
            total += v
        return total

    try:
        # Unpacking raises ValueError unless there is exactly one ".".
        whole, frac = string.split(u".")
        whole = _numeric(whole)
        frac = _numeric(frac) / (10.0 ** len(frac))
        return (mult * (whole + frac), orig)
    except ValueError:
        return (mult * _numeric(string), orig)
145
def normalize_punc(string):
    """Rewrite the punctuation of a number so that float() can parse it.

    Leading and trailing commas, full stops and apostrophes are
    stripped.  Every character that is neither numeric nor one of
    those three marks is then dropped.  What remains is rewritten so
    that at most one "." is left, standing for the decimal point.

    Heuristics, in the order they are applied:
      * A mark occurring more than once is a grouping separator and is
        removed.
      * With two or three different marks left, each occurring once,
        one is taken as the decimal point and the others are removed:
        the later of two, or the middle one of three.
      * A lone comma exactly three digits from the end of a string of
        at most seven characters is a thousands separator; any other
        lone comma is a decimal point.
      * A lone apostrophe is a decimal point.
      * A lone full stop followed by exactly "000" is a grouping
        separator; any other lone full stop is left as is.

    Returns the normalized text string.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    text_type = type(u"")
    string = text_type(string.strip(u",.'"))
    # Build the filtered text with join(): the filter() builtin only
    # returns a string on Python 2; on Python 3 it returns an iterator.
    string = u"".join(
        ch for ch in string if ch.isnumeric() or ch in u",.'")
    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    def normalize_two(a, b, string):
        # One of each - assume the first is grouping, second is point.
        a_idx = string.rindex(a)
        b_idx = string.rindex(b)
        if a_idx > b_idx:
            string = string.replace(b, u"").replace(a, u".")
        else:
            string = string.replace(a, u"").replace(b, u".")
        return string

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        comma_idx = string.index(u",")
        stops_idx = string.index(u".")
        quotes_idx = string.index(u"'")
        if (comma_idx < stops_idx < quotes_idx
                or quotes_idx < stops_idx < comma_idx):
            # The stop is in the middle; it is already the point.
            string = string.replace(u",", u"").replace(u"'", u"")
        elif (comma_idx < quotes_idx < stops_idx
                or stops_idx < quotes_idx < comma_idx):
            # The apostrophe is in the middle.
            string = string.replace(
                u",", u"").replace(
                u".", u"").replace(
                u"'", u".")
        else:
            # The comma is in the middle.
            string = string.replace(
                u"'", u"").replace(
                u".", u"").replace(
                u",", u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string[-4:] == u".000":
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string