'Advanced' sorteme functions.
[python-collate.git] / collate / _strings.py
1 import unicodedata
2
def alnumsplit(string):
    """Split a string into runs of letters and runs of numeric characters.

    The argument is converted to text first. A new piece starts whenever
    the text switches from alphabetic to numeric characters or back.
    Characters that are neither (spaces, punctuation, ...) never cause a
    split; they stay attached to the run they follow, or to the first run
    when they lead the string.

    Returns a list of text pieces which, joined together, give back the
    converted input; an empty input gives an empty list.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    text = type(u"")(string)
    pieces = []
    run = []
    # True for a numeric run, False for an alphabetic one, and None while
    # the current run has not seen a letter or a numeric character yet.
    run_numeric = None
    for char in text:
        # Numeric wins over alphabetic for characters that are both, so a
        # character is classified the same way wherever it appears.
        if char.isnumeric():
            kind = True
        elif char.isalpha():
            kind = False
        else:
            kind = None
        if kind is not None:
            if run_numeric is not None and kind != run_numeric:
                # ``run`` cannot be empty here: run_numeric is only set
                # right before a character is appended to it.
                pieces.append(u"".join(run))
                run = []
            # Record the type of the character that opens or continues the
            # run; otherwise a one-character run would be merged with the
            # run that follows it.
            run_numeric = kind
        run.append(char)
    if run:
        pieces.append(u"".join(run))
    return pieces
28
def wordlike(string):
    """Tell whether a string is 'word-like'.

    A string counts as word-like as soon as one of its characters is
    alphanumeric; an empty string is therefore not word-like.
    """
    # A plain loop is used on purpose: it is faster than the equivalent
    # any(map(type(string).isalnum, string)).
    for char in string:
        if char.isalnum():
            return True
    return False
43
def numeric(orig, invalid=float('inf')):
    """Interpret a value as a number, tolerating signs and punctuation.

    Parameters:
        orig: the value to interpret; it is converted to text first.
        invalid: the number to report when ``orig`` is empty or contains
            no numeric character at all.

    Returns a ``(number, original)`` pair. ``original`` is ``orig`` itself,
    except for a false ``orig`` (empty string, None, ...), which yields
    ``(invalid, '')``.

    Leading ``+`` and ``-`` signs are consumed (each ``-`` flips the sign).
    If ``float()`` cannot parse what is left, the text is cleaned up with
    ``normalize_punc`` and evaluated character by character from the
    Unicode numeric value of each character.
    """
    if not orig:
        return (invalid, '')
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    string = type(u"")(orig)
    if not any(char.isnumeric() for char in string):
        return (invalid, orig)

    mult = 1
    while string[:1] in (u"-", u"+"):
        if string[:1] == u"-":
            mult = -mult
        string = string[1:]

    # Early out if possible.
    try:
        return (float(string) * mult, orig)
    except ValueError:
        pass

    # Otherwise we need to do this the hard way.
    string = normalize_punc(string)

    def _numeric(chars):
        # Accumulate the Unicode numeric values of the characters as a
        # base-10 number.
        total = 0
        for char in chars:
            value = unicodedata.numeric(char)
            # Fractional characters such as a vulgar half are added without
            # shifting.  Zero must shift like any other digit, otherwise
            # "1000" would evaluate to 1.
            if value >= 1 or value == 0:
                total *= 10
            total += value
        return total

    parts = string.split(u".")
    if len(parts) == 2:
        whole, frac = parts
        value = _numeric(whole) + _numeric(frac) / (10.0 ** len(frac))
    else:
        # No decimal point in the normalized text.
        value = _numeric(string)
    return (mult * value, orig)
85
def normalize_punc(string):
    """Reduce the punctuation of a number-like string to one decimal point.

    The argument is converted to text, commas, full stops and apostrophes
    are stripped from both ends, and every character that is neither
    numeric nor one of those three marks is dropped.  The remaining marks
    are then interpreted heuristically:

    * a mark that occurs more than once is a grouping separator and is
      removed;
    * if two or three different marks remain (one of each), the second one
      by position is the decimal point and the others are removed;
    * a lone comma is a thousands separator (removed) when it sits four
      characters from the end of a text of at most seven characters, and
      a decimal point otherwise;
    * a lone apostrophe (as in MM'SS") is treated as a decimal point.

    Returns the cleaned text, with ``.`` as the only punctuation left.
    """
    marks = u",.'"
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    text = type(u"")(string).strip(marks)
    # Build the filtered text with join(): filter() only returns a string
    # on Python 2, and the code below relies on string methods.
    text = u"".join(
        char for char in text if char.isnumeric() or char in marks)

    # If anything occurs more than once, it's a separator.
    for mark in marks:
        if text.count(mark) > 1:
            text = text.replace(mark, u"")

    # Every mark still present occurs exactly once; order them by position.
    present = sorted((mark for mark in marks if mark in text),
                     key=text.index)

    if len(present) >= 2:
        # Two marks: the first is grouping, the second is the point.
        # Three marks: the middle one is the point (A,AAA.BB'CC and so on).
        point = present[1]
    elif present == [u","] and text[-4:-3] == u"," and len(text) <= 7:
        # Single comma as a thousands separator.
        point = None
    elif present:
        # A single comma that is not a thousands separator, a single
        # apostrophe, or a single full stop: it is the decimal point.
        point = present[0]
    else:
        point = None

    # Drop the grouping marks first, so that an original full stop can
    # never be confused with the freshly written decimal point.
    for mark in present:
        if mark != point:
            text = text.replace(mark, u"")
    if point is not None:
        text = text.replace(point, u".")
    return text