Test quotes.
[python-collate.git] / collate / strings.py
1 import unicodedata
# Unicode general categories that do not, by themselves, end a run of
# letters in sortemes(); any other category forces a split.
CONTINUE_ON = frozenset([
    "Ll", "Lm", "Lo", "Lt", "Lu",
    "Mc", "Me", "Mn",
    "Nd", "Nl", "No",
    "Po",
    "Zs",
])

# States for the `previous` tracker in sortemes(): which kind of piece
# was emitted most recently. Only their distinctness matters.
UNKNOWN, LETTER, NUMBER = range(3)

BREAKER = u"\u2029"  # Paragraph break character
INFINITY = float('inf')  # Numeric slot used for pieces that are not numbers
def sortemes(string, key=lambda s: s):
    """Generate a list of sortemes for the string.

    A sorteme, by analogy with grapheme/morpheme/etc. is an atom of
    sort information. This is larger than a word boundary but smaller
    than a sentence boundary; roughly, a sorteme boundary occurs between
    letters and numbers, between numbers and numbers if 'too much'
    punctuation exists in between, between lines.

    There is no formal specification for sortemes; the goal of this
    function is to provide good output for Collator.sortemekey.

    Arguments:
    string -- the text to split; any false value gives an empty list
    key -- callable applied to each non-empty text piece of the result

    Returns a list of (number, text) pairs in order of appearance. A
    run of digits is stored as (numeric(run), u''); a run of letters is
    stored as (INFINITY, key(run)).
    """

    words = []
    letters = []
    digits = []
    if not string:
        return words

    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str is already unicode text
    string = text_type(string)
    # This must be a real list: the double-punctuation test below looks
    # at categories[i - 1], which a lazy Python 3 map object cannot do.
    categories = [unicodedata.category(c) for c in string]
    previous = UNKNOWN

    def stripends(word):
        """Drop punctuation (P*) and symbol (S*) characters from both ends."""
        while word and unicodedata.category(word[0])[0] in "PS":
            word = word[1:]
        while word and unicodedata.category(word[-1])[0] in "PS":
            word = word[:-1]
        return word

    def aletters(letters):
        """Record a finished run of letters."""
        words.append((INFINITY, stripends(letters)))

    def adigits(digits):
        """Record a finished run of digits as its numeric value."""
        words.append((numeric(digits), u''))

    def rejoin(pending):
        """Pull the last recorded word back in front of `pending`."""
        pending.insert(0, stripends(words.pop()[1].strip()) + BREAKER)

    # Invariant: `letters` and `digits` are never both non-empty, because
    # starting one kind of run always flushes the other.
    #
    # TODO(jfw): This kind of evolved over time, there's probably a much
    # faster / more concise way to express it now.
    for i, (c, category) in enumerate(zip(string, categories)):

        # A letter run was flushed and letters have started again with no
        # number in between: glue the two runs back together.
        if letters and previous == LETTER and words:
            rejoin(letters)
            previous = UNKNOWN

        # Split at the first letter following a number or
        # non-continuing character.
        if category[0] == "L":
            letters.append(c)
            if digits:
                adigits(u"".join(digits).strip())
                digits = []
                previous = NUMBER

        # Split at the first number following a non-number or
        # non-continuing character.
        elif category[0] == "N":
            digits.append(c)
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER

        # Only certain punctuation allowed in numbers.
        elif digits and c not in u"',._":
            adigits(u"".join(digits))
            digits = []
            previous = NUMBER

        # Split if we find a non-continuing character ("weird" ones).
        # `digits` is necessarily empty here (see the invariant above).
        elif letters and category not in CONTINUE_ON:
            aletters(u"".join(letters).strip() + BREAKER)
            letters = []
            previous = LETTER

        # Split if we find two pieces of punctuation in a row, even
        # if we should otherwise continue.
        elif i and categories[i - 1][0] == "P" and category[0] == "P":
            if letters:
                aletters(u"".join(letters))
                letters = []
                previous = LETTER
            if digits:
                adigits(u"".join(digits))
                digits = []
                previous = NUMBER

        # Anything else extends the current run; it is dropped when no
        # run is in progress.
        elif digits:
            digits.append(c)
        elif letters:
            letters.append(c)

    # Same gluing as at the top of the loop, for a run still pending
    # when the input ends.
    if letters and previous == LETTER and words:
        rejoin(letters)

    if letters:
        aletters(u"".join(letters))
    if digits:
        adigits(u"".join(digits))

    return [(number, key(text) if text else u'') for number, text in words]
def numeric(orig, invalid=float('inf')):
    """Parse a number out of a string.

    After any leading "+" / "-" signs the text must start with a numeric
    character. Punctuation is then normalized by normalize_punc(), which
    also drops characters that are neither numeric nor one of ",.'".

    Arguments:
    orig -- the value to parse; it is converted to unicode text first
    invalid -- what to return when no number can be parsed

    Returns the parsed value as a number, or `invalid` when `orig` is
    false, contains no numeric character, or does not start with one
    after its signs.
    """
    if not orig:
        return invalid

    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str is already unicode text
    string = text_type(orig)
    if not any(c.isnumeric() for c in string):
        return invalid

    # Consume leading signs; every "-" flips the sign of the result.
    mult = 1
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    # Return the same plain `invalid` as the other failure paths, so the
    # result can always be ordered against ordinary numbers.
    if not string[:1].isnumeric():
        return invalid

    string = normalize_punc(string)

    # Early out if possible.
    try:
        return float(string) * mult
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    def _numeric(string):
        """Sum per-character numeric values, treating digits positionally."""
        total = 0
        for c in string:
            v = unicodedata.numeric(c)
            # Whole values (and zero) occupy a decimal place; fractional
            # characters are simply added on.
            if v >= 1 or v == 0:
                total *= 10
            total += v
        return total

    parts = string.split(u".")
    if len(parts) == 2:
        whole, frac = parts
        return mult * (_numeric(whole) + _numeric(frac) / (10.0 ** len(frac)))
    return mult * _numeric(string)
def normalize_punc(string):
    """Rewrite the separators of a number string into float() syntax.

    Commas, stops and apostrophes at either end are stripped, and every
    character that is neither numeric nor one of ",.'" is dropped. The
    separators that remain are then classified by heuristics as either
    digit grouping (removed) or the decimal point (rewritten as ".").

    Returns the normalized unicode text; it contains at most one "."
    and no "," or "'".
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str is already unicode text
    string = text_type(string.strip(u",.'"))
    # Build a string explicitly: on Python 3, filter() would return an
    # iterator with no count() or replace().
    string = u"".join(
        c for c in string if c.isnumeric() or c in u",.'")
    commas = string.count(u",")
    stops = string.count(u".")
    quotes = string.count(u"'")

    # If anything occurs more than once, it's a separator.
    if commas > 1:
        string = string.replace(u",", u"")
        commas = 0
    if stops > 1:
        string = string.replace(u".", u"")
        stops = 0
    if quotes > 1:
        string = string.replace(u"'", u"")
        quotes = 0

    def normalize_two(a, b, string):
        """Keep whichever of `a` / `b` comes last as the decimal point."""
        # One of each - assume the first is grouping, second is point.
        a_idx = string.rindex(a)
        b_idx = string.rindex(b)
        if a_idx > b_idx:
            string = string.replace(b, u"").replace(a, u".")
        else:
            string = string.replace(a, u"").replace(b, u".")
        return string

    if commas and stops and quotes:
        # If all three, assume the middle is the decimal point.
        # A,AAA.BB'CC
        # A.AAA,BB'CC
        # A,AAA'BB.CC
        # A.AAA'BB,CC
        # Not really valid, so do whatever we want...
        # A'AAA.BB,CC
        # A'AAA,BB.CC
        comma_idx = string.index(u",")
        stops_idx = string.index(u".")
        quotes_idx = string.index(u"'")
        if (comma_idx < stops_idx < quotes_idx
                or quotes_idx < stops_idx < comma_idx):
            string = string.replace(u",", u"").replace(u"'", u"")
        elif (comma_idx < quotes_idx < stops_idx
                or stops_idx < quotes_idx < comma_idx):
            string = string.replace(
                u",", u"").replace(
                u".", u"").replace(
                u"'", u".")
        else:
            string = string.replace(
                u"'", u"").replace(
                u".", u"").replace(
                u",", u".")

    elif stops and quotes:
        string = normalize_two(u".", u"'", string)

    elif commas and quotes:
        string = normalize_two(u",", u"'", string)

    elif commas and stops:
        string = normalize_two(u",", u".", string)

    elif commas:
        if string[-4:-3] == u"," and len(string) <= 7:
            # Single comma as a thousands separator.
            string = string.replace(u",", u"")
        else:
            # Single comma, not thousands - probably a decimal point.
            string = string.replace(u",", u".")

    elif quotes:
        # Single quote, probably MM'SS", equivalent to a decimal point.
        string = string.replace(u"'", u".")

    elif stops and string[-4:] == u".000":
        # Single stop, but no decimal - probably grouping.
        string = string.replace(u".", u"")

    return string