267c6e573fc23acbb52425331708dce4b2c4c426
[python-collate.git] / collate / strings.py
1 import unicodedata
2
# Unicode general categories that may appear inside a sorteme without
# forcing a split on their own; sortemes() breaks on any character that
# is neither a letter nor a number and whose category is not listed here.
CONTINUE_ON = frozenset((
    "Ll", "Lm", "Lo", "Lt", "Lu",  # letters
    "Mc", "Me", "Mn",              # marks
    "Nd", "Nl", "No",              # numbers
    "Po",                          # "other" punctuation
    "Zs",                          # space separators
))

# Scanner states used by sortemes() to track the kind of run being read.
UNKNOWN = 0
LETTER = 1
NUMBER = 2

# U+2029 PARAGRAPH SEPARATOR; sortemes() inserts it into its output as an
# internal boundary marker.
BREAKER = u"\u2029"
14
def _append_sorteme(words, piece, mode, previous_mode):
    """Add *piece* (read in *mode*) to *words*, gluing it on if needed.

    If the previously emitted sorteme was a letter run, BREAKER is
    appended to it; a following letter run is then merged into that same
    entry, while anything else becomes a new entry.
    """
    after_letters = previous_mode == LETTER and bool(words)
    if after_letters and mode == LETTER:
        # Two letter runs separated by weird punctuation: pretend the
        # punctuation isn't there.
        words[-1] += BREAKER + piece
        return
    if after_letters and mode == NUMBER:
        words[-1] += BREAKER
    words.append(piece)


def sortemes(string):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    A falsy *string* yields an empty list. Otherwise the argument is
    converted to text and the result is a list of text strings, some of
    which may contain BREAKER characters inserted by this function.
    """

    words = []
    if not string:
        return words

    # `unicode` only exists on Python 2; fall back to `str` on Python 3
    # so the function works on both.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    string = text_type(string)

    start = None            # index where the current run began
    last = None             # index of the last letter/number in the run
    mode = UNKNOWN          # kind of run currently being read
    previous_mode = UNKNOWN  # kind of the run most recently emitted
    category = "XX"

    # TODO(jfw): This kind of evolved over time, there's probably a much
    # faster / more concise way to express it now.
    for i, c in enumerate(string):
        broke = False
        prev_category = category
        this_mode = mode    # mode of the run *before* looking at c
        category = unicodedata.category(c)

        # Split at the first letter following a number or
        # non-continuing character.
        if category[0] == "L":
            if mode != LETTER:
                broke = True
                mode = LETTER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif category[0] == "N":
            if mode != NUMBER:
                broke = True
                mode = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        elif category not in CONTINUE_ON:
            broke = True
            mode = UNKNOWN

        # Only certain punctuation allowed in numbers.
        elif mode == NUMBER and category[0] == "P" and c not in "',._":
            broke = True
            mode = UNKNOWN

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif prev_category[0] == "P" and category[0] == "P":
            broke = True
            mode = UNKNOWN

        # Emit the run that just ended, provided it contained at least
        # one letter or number.
        if broke and start is not None and last is not None:
            _append_sorteme(
                words, string[start:last + 1], this_mode, previous_mode)
            previous_mode = this_mode

        if broke:
            start = i
            last = None
        if category[0] in "LN":
            last = i

    # Flush whatever run was still open when the string ended.
    if start is not None and last is not None:
        _append_sorteme(words, string[start:last + 1], mode, previous_mode)
    return words
101
def numeric(orig, invalid=float('inf')):
    """Interpret *orig* as a number and return a ``(value, text)`` pair.

    Returns ``(invalid, '')`` when *orig* is falsy, and ``(invalid, orig)``
    when its text form contains no numeric character or when the first
    character after any leading ``+``/``-`` signs is not numeric.
    Otherwise the text is passed through normalize_punc() and converted;
    the result is ``(number, orig)`` with the sign applied.

    May raise ValueError (from unicodedata.numeric) if the fallback
    conversion meets a character that has no numeric value.
    """
    if not orig:
        return (invalid, '')

    # `unicode` only exists on Python 2; fall back to `str` on Python 3
    # so the function works on both.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    string = text_type(orig)
    if not any(c.isnumeric() for c in string):
        return (invalid, orig)

    # Consume leading signs; each "-" flips the sign of the result.
    mult = 1
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    if not string[:1].isnumeric():
        return (invalid, orig)

    string = normalize_punc(string)

    # Early out if possible.
    try:
        return (float(string) * mult, orig)
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    def _numeric(string):
        # Accumulate per-character numeric values. A value that is zero
        # or at least one shifts the running total one decimal place
        # first; a value strictly between 0 and 1 is simply added.
        total = 0
        for c in string:
            v = unicodedata.numeric(c)
            if v >= 1 or v == 0:
                total *= 10
            total += v
        return total

    try:
        # Unpacking raises ValueError unless there is exactly one ".".
        whole, frac = string.split(".")
        whole = _numeric(whole)
        frac = _numeric(frac) / (10.0 ** len(frac))
        return (mult * (whole + frac), orig)
    except ValueError:
        return (mult * _numeric(string), orig)
147
def normalize_punc(string):
    """Normalize grouping/decimal punctuation in a numeric string.

    Leading and trailing ``,``, ``.`` and ``'`` are stripped and every
    character that is neither numeric nor one of those three is dropped.
    The remaining separators are then resolved heuristically so that the
    result contains no ``,`` or ``'`` and at most one ``.`` (the decimal
    point). Returns the normalized text string.
    """
    # `unicode` only exists on Python 2; fall back to `str` on Python 3
    # so the function works on both.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    string = text_type(string.strip(u",.'"))
    # Build a string explicitly: on Python 3 filter() returns an iterator,
    # which has none of the string methods used below.
    string = u"".join(
        c for c in string if c.isnumeric() or c in u",.'")
    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    def normalize_two(a, b, string):
        # One of each - assume the first is grouping, second is point.
        a_idx = string.rindex(a)
        b_idx = string.rindex(b)
        if a_idx > b_idx:
            string = string.replace(b, u"").replace(a, u".")
        else:
            string = string.replace(a, u"").replace(b, u".")
        return string

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        comma_idx = string.index(u",")
        stops_idx = string.index(u".")
        quotes_idx = string.index(u"'")
        if (comma_idx < stops_idx < quotes_idx
                or quotes_idx < stops_idx < comma_idx):
            # Stop is in the middle: it is already the decimal point.
            string = string.replace(u",", u"").replace(u"'", u"")
        elif (comma_idx < quotes_idx < stops_idx
                or stops_idx < quotes_idx < comma_idx):
            # Quote is in the middle: it becomes the decimal point.
            string = string.replace(
                u",", u"").replace(
                u".", u"").replace(
                u"'", u".")
        else:
            # Comma is in the middle: it becomes the decimal point.
            string = string.replace(
                u"'", u"").replace(
                u".", u"").replace(
                u",", u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string.endswith(u".000"):
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string