Remove NAME, just use __name__.
[python-collate.git] / collate / uca / __init__.py
1 import os
2
3 import collate.errors
4 import collate._abcollator
5 import collate._constants
6
class Trie(object):
    """Prefix tree mapping sequences of hashable parts to values.

    Every node is a two-item list ``[value, children]``: ``value`` stays
    ``None`` until a key ending at that node is added, and ``children`` maps
    the next part of a key to the corresponding child node.
    """

    def __init__(self):
        """Create an empty trie whose root node holds no value."""
        self.root = [None, {}]

    def add(self, key, value):
        """Store ``value`` under ``key``, creating intermediate nodes as needed.

        ``key`` is any iterable of hashable parts.  Adding the same key twice
        overwrites the earlier value.
        """
        curr_node = self.root
        for part in key:
            curr_node = curr_node[1].setdefault(part, [None, {}])
        curr_node[0] = value

    def find_prefix(self, key):
        """Return ``(value, remainder)`` for the longest stored prefix of ``key``.

        ``key`` must be a sliceable sequence (string, list, ...).  ``value``
        is the value stored for the longest prefix of ``key`` that was added
        with a non-``None`` value, and ``remainder`` is the part of ``key``
        following that prefix.  When no stored prefix matches, the root's
        value (``None`` unless the empty key was added) is returned together
        with ``key`` itself.
        """
        curr_node = self.root
        best_value = curr_node[0]
        best_length = 0
        for length, part in enumerate(key, 1):
            children = curr_node[1]
            if part not in children:
                break
            curr_node = children[part]
            # Only nodes that actually carry a value count as a match; a
            # value-less interior node must not hide a shorter stored prefix.
            if curr_node[0] is not None:
                best_value = curr_node[0]
                best_length = length
        remainder = key[best_length:] if best_length else key
        return (best_value, remainder)
27
28
class Collator(collate._abcollator.Collator):
    """Collator backed by a collation-element table read from a text file.

    The table is looked up next to this module, trying ``<locale>.txt``,
    then ``<language>.txt`` (the part of the locale before ``_``) and
    finally ``allkeys.txt``; the first file that can be opened is used.
    """

    # Individual code points that share the 0xFB40 implicit-weight base with
    # the 0x4E00-0x9FCB range.
    __BASE_FB40_EXTRAS = frozenset([
        0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F,
        0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29,
    ])

    def __init__(self, locale_code, encoding=None):
        """Load the collation table for ``locale_code``.

        ``locale_code`` may carry an encoding suffix (``"en_US.UTF-8"``);
        everything after the first ``.`` is ignored and the rest is
        lower-cased to build the candidate file names.  ``encoding`` is
        accepted but not used by this class.

        Raises ``collate.errors.InvalidLocaleError`` when none of the
        candidate table files can be opened.
        """
        self.__table = Trie()
        self.locale = locale_code
        dirname = os.path.dirname(__file__)
        locale_code = locale_code.split(".")[0].lower()
        short_code = locale_code.split("_")[0]
        filenames = [
            os.path.join(dirname, locale_code + ".txt"),
            os.path.join(dirname, short_code + ".txt"),
            os.path.join(dirname, "allkeys.txt"),
        ]
        for filename in filenames:
            try:
                # Plain "r": the "U" flag is rejected by newer Python
                # versions, and every line is stripped while parsing anyway.
                fileobj = open(filename, "r")
            except EnvironmentError:
                continue
            # Make sure the table file is closed even if parsing fails.
            with fileobj:
                self.__load(fileobj)
            break
        else:
            raise collate.errors.InvalidLocaleError(locale_code)

    def __load(self, fileobj):
        """Parse table lines from ``fileobj`` into the lookup trie.

        Everything from ``#`` or ``%`` to the end of a line is a comment.
        Blank lines and lines starting with ``@`` are skipped.  Any other
        line has the form ``<hex code points> ; [<alt><w>.<w>...]...``; each
        bracketed group becomes an ``(alt, [weights])`` tuple, where ``alt``
        is the first character inside the bracket and the weights are kept
        as the strings found in the file.
        """
        for line in fileobj:
            # Cut comments with split() rather than find(): find() returns
            # -1 when there is no comment, which would chop off the last
            # real character of a final line lacking a newline.
            line = line.split("#", 1)[0].split("%", 1)[0].strip()
            if not line or line.startswith("@"):
                continue

            chars_field, _, elements_field = line.partition(";")
            code_points = [int(cp, 16) for cp in chars_field.split()]
            if not code_points:
                # An empty key would put a value on the trie root, and
                # key() could then never consume any input.
                continue

            coll_elements = []
            # Each chunk after a "[" holds one element up to its "]".
            for chunk in elements_field.split("[")[1:]:
                body, closer, _ = chunk.partition("]")
                if not closer or not body:
                    continue  # unterminated or empty bracket: ignore it
                coll_elements.append((body[0], body[1:].split(".")))
            self.__table.add(code_points, coll_elements)

    def __implicit_weight(self, cp):
        """Return two synthesized collation elements for code point ``cp``.

        Used for code points that have no entry in the loaded table.  The
        result has the same ``(alt, [weights])`` shape as table entries,
        with weights formatted as upper-case hexadecimal strings.
        """
        # UCA 7.1.3.
        if 0x4E00 <= cp <= 0x9FCB or cp in self.__BASE_FB40_EXTRAS:
            base = 0xFB40
        elif (0x3400 <= cp <= 0x4DB5
              or 0x20000 <= cp <= 0x2A6D6
              or 0x2A700 <= cp <= 0x2B734):
            base = 0xFB80
        else:
            base = 0xFBC0

        aaaa = base + (cp >> 15)
        bbbb = (cp & 0x7FFF) | 0x8000
        # FIXME(jfw): Reread standard to make sure the 4th element is
        # right.
        return [('.', ["%04X" % aaaa, "0020", "0002", "0002"]),
                ('.', ["%04X" % bbbb, "0000", "0000", "0000"])]

    def key(self, string):
        """Return the sort key of ``string`` as a tuple of integers.

        The string is converted to code points and split greedily into the
        longest sequences found in the table; a code point without a table
        entry gets implicit weights instead.  The key lists the non-zero
        weights of level 1, then levels 2, 3 and 4, with a ``0`` separating
        consecutive levels.
        """
        collation_elements = []

        lookup_key = [ord(ch) for ch in string]
        while lookup_key:
            value, remainder = self.__table.find_prefix(lookup_key)
            if value is None:
                # No table entry: consume exactly one code point from the
                # sequence being looked up (not from the trie's remainder),
                # so that no code point is ever dropped.
                value = self.__implicit_weight(lookup_key[0])
                remainder = lookup_key[1:]
            collation_elements.extend(value)
            lookup_key = remainder

        sort_key = []
        for level in range(4):
            if level:
                sort_key.append(0)  # level separator
            for element in collation_elements:
                weight = int(element[1][level], 16)
                if weight:
                    sort_key.append(weight)

        return tuple(sort_key)