ba20b487bd1e4559f522c909f6cab161b68b2767
[python-collate.git] / collate / uca / __init__.py
1 import os
2
3 import collate.errors
4 import collate._abcollator
5
class Trie(object):
    """Prefix tree mapping sequences of hashable parts to values.

    Every node is a two-item list ``[value, children]`` where ``value`` is
    the object stored for the key ending at that node (``None`` when no key
    ends there) and ``children`` is a dict from the next key part to the
    child node.  Because ``None`` marks "no value", storing ``None`` is
    indistinguishable from not storing anything.
    """

    def __init__(self):
        """Create an empty trie consisting of a value-less root node."""
        self.root = [None, {}]

    def add(self, key, value):
        """Store ``value`` under ``key``, replacing any previous value.

        ``key`` is any iterable of hashable parts; missing intermediate
        nodes are created on the way down.
        """
        node = self.root
        for part in key:
            node = node[1].setdefault(part, [None, {}])
        node[0] = value

    def find_prefix(self, key):
        """Return ``(value, remainder)`` for the longest stored prefix of ``key``.

        ``value`` is the value of the longest prefix of ``key`` that was
        added to the trie and ``remainder`` is the part of ``key`` following
        that prefix.  When no prefix of ``key`` carries a value the result is
        ``(None, key[0:])``, i.e. nothing is consumed.

        ``key`` must support slicing (list, tuple, string, ...).
        """
        node = self.root
        # Remember the deepest node that really holds a value.  Merely
        # walking along the path of a longer key must not consume parts
        # of ``key``: those intermediate nodes carry no value.
        best_value = node[0]
        best_length = 0
        for depth, part in enumerate(key, 1):
            node = node[1].get(part)
            if node is None:
                break
            if node[0] is not None:
                best_value = node[0]
                best_length = depth
        return (best_value, key[best_length:])
26
27
class Collator(collate._abcollator.Collator):
    """Collator that builds sort keys from a collation weight table.

    The table is read from a text file located next to this module and is
    stored in a ``Trie`` keyed by lists of integer code points.  Each table
    value is a list of collation elements ``(alt, weights)`` where ``alt``
    is the single marker character following ``[`` in the file and
    ``weights`` is a list of hexadecimal weight strings, one per level.
    """

    # Code points outside 0x4E00-0x9FCB that nevertheless use the 0xFB40
    # implicit-weight base (see UCA 7.1.3).
    __FB40_EXTRA = frozenset([
        0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F,
        0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29,
    ])

    def __init__(self, locale_code, strict=False):
        """Load the weight table for ``locale_code``.

        The encoding suffix (everything from the first ``.``) is dropped and
        the code is lower-cased.  The first file that can be opened among
        ``<locale>.txt``, ``<language>.txt`` (the part before ``_``) and
        ``allkeys.txt`` in this module's directory is loaded.

        ``strict`` is accepted but not used by this constructor.

        Raises ``collate.errors.InvalidLocaleError`` when none of the
        candidate files can be opened.
        """
        self.__table = Trie()
        self.locale = locale_code
        dirname = os.path.dirname(__file__)
        locale_code = locale_code.split(".")[0].lower()
        short_code = locale_code.split("_")[0]
        candidates = [locale_code + ".txt", short_code + ".txt", "allkeys.txt"]
        for name in candidates:
            try:
                # Plain "r": the legacy "U" flag is rejected by newer
                # Python 3 releases, and lines are stripped while parsing,
                # so stray "\r" characters are harmless.
                fileobj = open(os.path.join(dirname, name), "r")
            except EnvironmentError:
                continue
            try:
                self.__load(fileobj)
            finally:
                # Always release the handle, even when parsing fails.
                fileobj.close()
            break
        else:
            raise collate.errors.InvalidLocaleError(locale_code)

    def __load(self, fileobj):
        """Parse weight-table lines from ``fileobj`` into the trie.

        Expected line shape: ``<hex code points> ; [<alt><w>.<w>...]...``.
        Text after ``#`` or ``%`` is a comment.  Blank lines, comment-only
        lines, lines starting with ``@`` and lines without ``;`` or without
        any code point are skipped.

        Raises ``ValueError`` for a code point that is not hexadecimal or
        for a collation element lacking its closing ``]``.
        """
        for raw_line in fileobj:
            # Cut trailing comments.  split() leaves a comment-free line
            # untouched, so a last line without a newline is not truncated.
            line = raw_line.split("#", 1)[0].split("%", 1)[0].strip()
            if not line or line.startswith("@"):
                continue

            chars_field, separator, elements_field = line.partition(";")
            if not separator:
                # Not a table entry; skip it rather than guess.
                continue

            code_points = [int(token, 16) for token in chars_field.split()]
            if not code_points:
                # An empty key would be stored on the trie root and match
                # every lookup without consuming any input.
                continue

            self.__table.add(code_points,
                             self.__parse_elements(elements_field, line))

    @staticmethod
    def __parse_elements(text, line):
        """Return the ``(alt, weights)`` pairs for every ``[...]`` in ``text``.

        ``line`` is only used to build the error message.
        """
        elements = []
        start = text.find("[")
        while start != -1:
            end = text.find("]", start)
            if end == -1:
                raise ValueError(
                    "unterminated collation element in line: %r" % (line,))
            alt = text[start + 1]
            weights = text[start + 2:end].split(".")
            elements.append((alt, weights))
            start = text.find("[", start + 1)
        return elements

    def __implicit_weight(self, cp):
        """Return the two implicit collation elements for code point ``cp``.

        Used for code points missing from the table.  The base is chosen
        from hard-coded code point ranges, then combined with the high bits
        of ``cp``; the second element carries the low 15 bits with bit 15
        set.
        """
        # UCA 7.1.3.
        if 0x4E00 <= cp <= 0x9FCB or cp in self.__FB40_EXTRA:
            base = 0xFB40
        elif (0x3400 <= cp <= 0x4DB5
              or 0x20000 <= cp <= 0x2A6D6
              or 0x2A700 <= cp <= 0x2B734):
            base = 0xFB80
        else:
            base = 0xFBC0

        aaaa = base + (cp >> 15)
        bbbb = (cp & 0x7FFF) | 0x8000
        # FIXME(jfw): Reread standard to make sure the 4th element is
        # right.
        return [('.', ["%04X" % aaaa, "0020", "0002", "0002"]),
                ('.', ["%04X" % bbbb, "0000", "0000", "0000"])]

    def key(self, string):
        """Return the sort key of ``string`` as a tuple of integers.

        The string is converted to code points, which are matched greedily
        against the table; a code point with no table entry gets implicit
        weights.  The key lists the non-zero weights of level 0, then of
        levels 1 to 3, each later level preceded by a ``0`` separator.
        """
        collation_elements = []

        code_points = [ord(ch) for ch in string]
        while code_points:
            value, remainder = self.__table.find_prefix(code_points)
            # Fall back to implicit weights for exactly one code point when
            # nothing matched.  Restarting from code_points[1:] instead of
            # trusting the returned remainder guarantees that no input is
            # skipped and that the loop always makes progress.
            if value is None or len(remainder) >= len(code_points):
                value = self.__implicit_weight(code_points[0])
                remainder = code_points[1:]
            collation_elements.extend(value)
            code_points = remainder

        sort_key = []

        for level in range(4):
            if level:
                sort_key.append(0)  # level separator
            for _alt, weights in collation_elements:
                weight = int(weights[level], 16)
                if weight:
                    sort_key.append(weight)

        return tuple(sort_key)