Installation docs.
[python-collate.git] / collate / _strings.py
1 import unicodedata
2
# Unicode general categories (as reported by unicodedata.category) that
# do not, by themselves, force a sorteme boundary in sortemes().
CONTINUE_ON = frozenset((
    # Letters: lowercase, modifier, other, titlecase, uppercase.
    "Ll", "Lm", "Lo", "Lt", "Lu",
    # Marks: spacing combining, enclosing, nonspacing.
    "Mc", "Me", "Mn",
    # Numbers: decimal digit, letter, other.
    "Nd", "Nl", "No",
    # Punctuation: other.
    "Po",
    # Separators: space.
    "Zs",
))

# Scanner states used by sortemes() to track what kind of run it is in.
UNKNOWN = 0
LETTER = 1
NUMBER = 2
12
def sortemes(string):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, and between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    The input is converted with unicode() and scanned one character at
    a time, using each character's Unicode general category to decide
    where to split. Every emitted piece is the slice from the start of
    a run up to the last letter or number seen in it, so trailing
    punctuation and spaces are not part of the piece.

    Returns a list of unicode strings; an empty list is returned for
    an empty (falsy) input.
    """

    words = []
    if not string:
        return words
    string = unicode(string)
    # Index where the current run began, and index of the most recent
    # letter/number inside it (None until one has been seen).
    start = None
    last = None
    # Kind of run being scanned now, and kind of the last emitted run.
    mode = UNKNOWN
    previous_mode = UNKNOWN
    # Placeholder category for "before the first character"; it is
    # neither a letter, number nor punctuation category.
    category = "XX"
    for i, c in enumerate(string):
        broke = False
        prev_category = category
        # Remember the mode of the run that ends here, before the
        # branches below possibly switch to a new mode.
        this_mode = mode
        category = unicodedata.category(c)

        # Split at the first letter following a number or
        # non-continuing character.
        if category[0] == "L":
            if mode != LETTER:
                broke = True
                mode = LETTER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif category[0] == "N":
            if mode != NUMBER:
                broke = True
                mode = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        elif category not in CONTINUE_ON:
            broke = True
            mode = UNKNOWN

        # Only certain punctuation allowed in numbers.
        elif mode == NUMBER and category[0] == "P" and c not in "',._":
            broke = True
            mode = UNKNOWN

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i > 0 and prev_category[0] == "P" and category[0] == "P":
            broke = True
            mode = UNKNOWN

        # Emit the run that just ended, but only if it contained at
        # least one letter or number (last is set).
        if broke and start is not None and last is not None:
            # If we read two strings separated by weird punctuation,
            # pretend the punctuation isn't there.
            if (this_mode == previous_mode == LETTER
                and (category[0] == "P" or prev_category[0] == "P")
                and words):
                words[-1] += u" " + string[start:last+1]
            else:
                # This ensures "foo2 bar" sorts as ["foo ", 2, "bar"]
                # Which sorts after ["foo", "bar"].
                if this_mode == NUMBER and previous_mode == LETTER and words:
                    words[-1] += u" "
                words.append(string[start:last+1])
            previous_mode = this_mode

        if broke:
            # A new run starts at this character; it has no letter or
            # number yet unless the check below records one.
            start = i
            last = None
        if category[0] in "LN":
            last = i
    # Flush the final run, which no later character terminated.
    this_mode = mode
    if start is not None and last is not None:
        # Two adjacent letter runs are joined with a single space.
        if this_mode == LETTER and previous_mode == LETTER and words:
            words[-1] += u" " + string[start:last+1]
        else:
            # Same trailing-space marker as inside the loop for a
            # number run that follows a letter run.
            if this_mode == NUMBER and previous_mode == LETTER and words:
                words[-1] += u" "
            words.append(string[start:last+1])
    return words
100
def numeric(orig, invalid=float('inf')):
    """Interpret a string as a number for sorting purposes.

    Returns a ``(value, text)`` tuple:

    - ``(invalid, '')`` when ``orig`` is empty (falsy);
    - ``(invalid, orig)`` when ``orig`` contains no numeric character,
      or when the first character after any leading ``+``/``-`` signs
      is not numeric;
    - ``(number, orig)`` otherwise.

    Leading signs are consumed one by one, each ``-`` flipping the sign
    of the result. The remaining text is passed through normalize_punc
    and then parsed with float(); if float() rejects it, the value is
    accumulated character by character from unicodedata.numeric.
    """
    if not orig:
        return (invalid, '')

    text = unicode(orig)
    if not any(ch.isnumeric() for ch in text):
        return (invalid, orig)

    # Peel off every leading sign character; an even number of minus
    # signs cancels out.
    sign = 1
    while text[:1] in (u"-", u"+"):
        if text[:1] == u"-":
            sign = -sign
        text = text[1:]

    if not text[:1].isnumeric():
        return (invalid, orig)

    text = normalize_punc(text)

    # Fast path: let float() parse it if it can.
    try:
        return (float(text) * sign, orig)
    except ValueError:
        pass

    # Slow path: add up the numeric value of each character.
    def _numeric(chars):
        acc = 0
        for ch in chars:
            value = unicodedata.numeric(ch)
            # Zero and values of one or more shift the running total a
            # decimal place; any other value is added without shifting.
            if value >= 1 or value == 0:
                acc *= 10
            acc += value
        return acc

    try:
        # Unpacking raises ValueError unless there is exactly one ".".
        int_part, frac_part = text.split(".")
        magnitude = (_numeric(int_part)
                     + _numeric(frac_part) / (10.0 ** len(frac_part)))
        return (sign * magnitude, orig)
    except ValueError:
        return (sign * _numeric(text), orig)
146
def normalize_punc(string):
    """Normalize the grouping/decimal punctuation of a numeric string.

    Every character that is neither numeric nor one of ``,``, ``.``,
    ``'`` is dropped, punctuation at either end is stripped, and the
    remaining punctuation is reduced so that the result contains at
    most one ``.`` acting as the decimal point:

    - a mark occurring more than once is a grouping separator and is
      removed;
    - with one each of two or three different marks, the later (or,
      for three, the middle) one is taken as the decimal point;
    - a lone ``,`` is a thousands separator when exactly three
      characters follow it and the string is at most seven characters
      long, otherwise a decimal point;
    - a lone ``'`` is a decimal point;
    - a lone ``.`` followed by exactly ``000`` is a grouping separator.

    Examples: ``"1,000,000"`` -> ``"1000000"``, ``"1.000,50"`` ->
    ``"1000.50"``, ``"1,5"`` -> ``"1.5"``, ``"12'30"`` -> ``"12.30"``.

    Returns the normalized text string (possibly empty).
    """
    # Work on text strings on both Python 2 (unicode) and Python 3 (str).
    try:
        text_type = unicode
    except NameError:
        text_type = str

    string = text_type(string.strip(u",.'"))
    # join() rather than filter(): filter() only returns a string on
    # Python 2, and the code below needs string methods.
    string = u"".join(
        c for c in string if c.isnumeric() or c in u",.'")
    # Strip again now that other characters are gone. Otherwise
    # punctuation that was next to trailing junk (e.g. "3.14. " or
    # "(1,5,)") survives and is miscounted as a repeated separator.
    string = string.strip(u",.'")

    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    def normalize_two(a, b, text):
        # One of each - assume the first is grouping, second is point.
        if text.rindex(a) > text.rindex(b):
            return text.replace(b, u"").replace(a, u".")
        return text.replace(a, u"").replace(b, u".")

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        comma_idx = string.index(u",")
        stops_idx = string.index(u".")
        quotes_idx = string.index(u"'")
        if (comma_idx < stops_idx < quotes_idx
                or quotes_idx < stops_idx < comma_idx):
            # The stop is in the middle: keep it, drop the others.
            string = string.replace(u",", u"").replace(u"'", u"")
        elif (comma_idx < quotes_idx < stops_idx
                or stops_idx < quotes_idx < comma_idx):
            # The quote is in the middle: it becomes the point.
            string = string.replace(
                u",", u"").replace(
                u".", u"").replace(
                u"'", u".")
        else:
            # The comma is in the middle: it becomes the point.
            string = string.replace(
                u"'", u"").replace(
                u".", u"").replace(
                u",", u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string[-4:] == u".000":
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string