Category-based splitting.
[python-collate.git] / collate / _strings.py
1 import unicodedata
2
def strip_punc(string):
    """Return *string* with punctuation and symbol characters removed.

    A character is dropped when its Unicode general category starts with
    ``P`` (punctuation) or ``S`` (symbol); every other character is kept
    in its original order.

    The result is built with ``join`` rather than ``filter`` because
    ``filter`` only hands back a string on Python 2; on Python 3 it
    returns a lazy iterator.
    """
    return u"".join(
        c for c in string if unicodedata.category(c)[0] not in "PS")
5
def strip_ends(string):
    """Trim separators, punctuation and symbols from both ends.

    Characters are removed from the front and from the back for as long
    as their Unicode general category starts with ``Z``, ``P`` or ``S``.
    Characters in the interior are never touched.  A falsy argument is
    returned unchanged.
    """
    if not string:
        return string

    def strippable(ch):
        return unicodedata.category(ch)[0] in "ZPS"

    # Walk inwards from each end and slice once at the finish.
    lo = 0
    hi = len(string)
    while lo < hi and strippable(string[lo]):
        lo += 1
    while hi > lo and strippable(string[hi - 1]):
        hi -= 1
    return string[lo:hi]
12
def alnumsplit(string):
    """Split a string into runs of letters and runs of numbers.

    The text form of *string* is scanned left to right while tracking
    whether the current run is numeric, alphabetic or still undecided:

    * a letter while in a numeric run starts a new alphabetic piece;
    * a space separator or opening/closing punctuation (categories
      ``Zs``, ``Ps``, ``Pe``) while in a numeric run ends the piece and
      leaves the next run undecided;
    * a numeric character while in an alphabetic run starts a new
      numeric piece.

    Every piece is passed through ``strip_ends`` before being stored, so
    a piece may come out as an empty string (for example the tail of
    ``u"12)"``).

    Returns a list of text strings; a falsy argument, or one whose text
    form is empty, yields ``[]``.
    """
    if not string:
        return []
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    text = type(u"")(string)
    if not text:
        # A truthy object can still convert to empty text; without this
        # guard the code after the loop would run with nothing scanned.
        return []

    pieces = []
    # None: undecided, True: inside a numeric run, False: inside an
    # alphabetic run.  Not called ``numeric`` so that the module-level
    # numeric() function is not shadowed.
    in_number = None
    start = 0
    for i, char in enumerate(text):
        split_here = False
        if in_number is None:
            if char.isnumeric():
                in_number = True
            elif char.isalpha():
                in_number = False
        elif in_number:
            if char.isalpha():
                split_here = True
                in_number = False
            elif unicodedata.category(char) in ("Zs", "Ps", "Pe"):
                split_here = True
                in_number = None
        elif char.isnumeric():
            split_here = True
            in_number = True

        if split_here:
            pieces.append(strip_ends(text[start:i]))
            start = i

    # Slice to the end instead of relying on the loop variable.
    pieces.append(strip_ends(text[start:]))
    return pieces
43
def wordlike(string):
    """Report whether *string* looks like a word.

    A string is word-like as soon as any one of its characters is
    alphanumeric.  An empty string is never word-like.
    """
    # Deliberately a hand-written loop: the original author noted it
    # is faster than any(map(type(string).isalnum, string)).
    for ch in string:
        if ch.isalnum():
            return True
    return False
58
def numeric(orig, invalid=float('inf')):
    """Interpret *orig* as a number.

    Returns a ``(value, text)`` pair:

    * ``(invalid, '')`` when *orig* is falsy;
    * ``(invalid, orig)`` when *orig* contains no numeric character, or
      when the first character after any leading ``+``/``-`` signs is
      not numeric;
    * ``(number, orig)`` otherwise.

    Leading signs are consumed one by one and every ``-`` flips the sign
    of the result.  ``float()`` is tried first; if it rejects the text,
    the separators are normalised with ``normalize_punc`` and the value
    is accumulated character by character with ``unicodedata.numeric``.

    *invalid* is the value reported for non-numbers; it defaults to
    positive infinity.
    """
    if not orig:
        return (invalid, '')

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    text = type(u"")(orig)
    for c in text:
        if c.isnumeric():
            break
    else:
        return (invalid, orig)

    sign = 1
    # A tuple, not a string: u"" is "in" every string, and text[:1] is
    # u"" once the text is exhausted.
    while text[:1] in (u"-", u"+"):
        if text[:1] == u"-":
            sign = -sign
        text = text[1:]

    if not text[:1].isnumeric():
        return (invalid, orig)

    # Early out if possible.
    try:
        return (float(text) * sign, orig)
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    text = normalize_punc(text)

    def digits_value(chars):
        total = 0
        for c in chars:
            v = unicodedata.numeric(c)
            # Values in (0, 1), e.g. vulgar fractions, are added
            # without shifting the running total a decimal place.
            if v >= 1 or v == 0:
                total *= 10
            total += v
        return total

    # Check the split result explicitly so that only "not exactly one
    # decimal point" selects the fallback, not an error raised while
    # converting the digits.
    parts = text.split(u".")
    if len(parts) != 2:
        return (sign * digits_value(text), orig)

    whole, frac = parts
    value = digits_value(whole) + digits_value(frac) / (10.0 ** len(frac))
    return (sign * value, orig)
104
def normalize_punc(string):
    """Reduce the separators in a numeric string to at most one ``.``.

    Leading and trailing ``,``, ``.`` and ``'`` are stripped, then every
    character that is neither numeric nor one of those three separators
    is discarded.  The remaining separators are interpreted as follows:

    * A separator occurring more than once is a grouping separator and
      is removed.
    * If two kinds remain, the first is grouping and the second is the
      decimal point.
    * If all three remain, the middle one is the decimal point, e.g.
      ``A,AAA.BB'CC``, ``A.AAA,BB'CC``, ``A,AAA'BB.CC``.
    * A lone ``,`` is a thousands separator when it sits four characters
      from the end of a string of at most seven characters; otherwise it
      is a decimal point.
    * A lone ``'`` (as in ``MM'SS"``) is treated as a decimal point.
    * A lone ``.`` is left alone.

    Grouping separators are deleted and the decimal point, if any, is
    written as ``.``.  Returns a text string.
    """
    separators = u",.'"
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    text = type(u"")(string.strip(separators))
    # join() instead of filter(): filter() only returns a string on
    # Python 2.
    text = u"".join(
        c for c in text if c.isnumeric() or c in separators)

    # If anything occurs more than once, it's a separator.
    for sep in separators:
        if text.count(sep) > 1:
            text = text.replace(sep, u"")

    # Whatever is still present now occurs exactly once; list the
    # survivors in order of appearance.
    found = sorted(
        (sep for sep in separators if sep in text), key=text.index)

    if len(found) >= 2:
        # With two kinds the later one is the point; with three the
        # middle one is.  Either way that is the second survivor.
        point = found[1]
    elif found == [u","]:
        if text[-4:-3] == u"," and len(text) <= 7:
            # Single comma as a thousands separator.
            point = None
        else:
            # Single comma, not thousands - probably a decimal point.
            point = u","
    elif found:
        # A lone "." or a lone "'".
        point = found[0]
    else:
        point = None

    for sep in found:
        if sep != point:
            text = text.replace(sep, u"")
    if point is not None and point != u".":
        text = text.replace(point, u".")
    return text