uca: Share DUCET data; ensure it exists at import time.
[python-collate.git] / collate / uca / __init__.py
1 import os
2
3 import collate.errors
4 import collate._abcollator
5 import collate._constants
6
7 DIRNAME = os.path.dirname(__file__)
8
class Trie(object):
    """Prefix tree mapping sequences of hashable parts to values.

    Every node is a two-item list ``[value, children]``: ``value`` is
    ``None`` for a node at which no key ends, and ``children`` maps the
    next part of a key to its child node.
    """

    def __init__(self):
        self.root = [None, {}]

    def add(self, key, value):
        """Store *value* under *key*, replacing any value already there.

        *key* is any iterable of hashable parts (for example a list of
        integer code points).
        """
        node = self.root
        for part in key:
            node = node[1].setdefault(part, [None, {}])
        node[0] = value

    def find_prefix(self, key):
        """Return ``(value, remainder)`` for the longest stored prefix of *key*.

        ``value`` is the value stored under the longest prefix of *key*
        that actually has a value, and ``remainder`` is the part of *key*
        following that prefix.  When no prefix of *key* has a value the
        result is ``(None, key[0:])``, i.e. nothing is consumed.

        *key* must support slicing (str, list, tuple, ...).
        """
        node = self.root
        best_value = node[0]
        best_length = 0
        for depth, part in enumerate(key, 1):
            node = node[1].get(part)
            if node is None:
                break
            # Only nodes that carry a value count as a match; walking
            # through an intermediate node of a longer key must not
            # discard a shorter match already found.
            if node[0] is not None:
                best_value = node[0]
                best_length = depth
        return (best_value, key[best_length:])
28
def load_trie(fileobj, trie):
    """Parse collation-table lines from *fileobj* and add them to *trie*.

    *fileobj* is any iterable of text lines.  Each data line has the form
    ``<hex code points> ; [<alt><hex>.<hex>...]...`` where ``<alt>`` is the
    single character following ``[``.  For every such line,
    ``trie.add(code_points, elements)`` is called with a list of integer
    code points and a list of ``(alt, [weights...])`` tuples.

    Text from ``#`` or ``%`` to the end of the line is a comment.  Blank
    lines, comment-only lines, ``@`` directive lines, and lines that have
    no ``;`` separator or no code points are skipped.

    Raises ValueError if a ``[`` has no closing ``]`` or a field is not
    valid hexadecimal.
    """
    for line in fileobj:
        # Strip comments without depending on a trailing newline being
        # present (the last line of a file may lack one).
        line = line.partition("#")[0].partition("%")[0].strip()
        if not line or line.startswith("@"):
            continue

        code_points, separator, rest = line.partition(";")
        char_list = code_points.split()
        if not separator or not char_list:
            # Malformed entry; adding an empty key would put a value on
            # the trie root, which matches every lookup.
            continue

        coll_elements = []
        begin = rest.find("[")
        while begin != -1:
            end = rest.find("]", begin)
            if end == -1:
                raise ValueError(
                    "unterminated collation element: %r" % (line,))
            alt = rest[begin + 1]
            weights = [int(field, 16)
                       for field in rest[begin + 2:end].split(".")]
            coll_elements.append((alt, weights))
            begin = rest.find("[", begin + 1)

        trie.add([int(ch, 16) for ch in char_list], coll_elements)
61
class Collator(collate._abcollator.Collator):
    """Collator that builds sort keys from a collation element table."""

    def __init__(self, locale, encoding=None):
        """Select the collation table for *locale*.

        The "C" locale uses the shared module-level ``_DUCET`` table.  Any
        other locale loads ``<locale.lower()>.txt`` from this package's
        directory into a fresh table.

        Raises collate.errors.InvalidLocaleError if that file cannot be
        opened.
        """
        # NOTE(review): collate._locale is not among this module's visible
        # imports; presumably the collate package imports it -- confirm.
        self.locale, self.encoding = collate._locale.getpair(locale, encoding)
        if self.locale == "C":
            self.__table = _DUCET
        else:
            self.__table = Trie()
            # NOTE(review): the file name is built from the raw ``locale``
            # argument rather than the normalized ``self.locale`` --
            # confirm that this is intended.
            filename = os.path.join(DIRNAME, locale.lower() + ".txt")
            try:
                # NOTE: mode "U" was removed in Python 3.11; kept because
                # this module targets Python 2.
                fileobj = open(filename, "rU")
            except EnvironmentError:
                raise collate.errors.InvalidLocaleError(self.locale)
            try:
                load_trie(fileobj, self.__table)
            finally:
                # Always release the handle, even if parsing fails.
                fileobj.close()

    def __implicit_weight(self, cp):
        """Return the two implicit collation elements for code point *cp*.

        Used for code points that have no entry in the table (UCA 7.1.3).
        The base value depends on which of the hard-coded code point
        ranges below contains *cp*.
        """
        if (0x4E00 <= cp <= 0x9FCB
            or (cp in [0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14,
                       0xFA1F, 0xFA21, 0xFA23, 0xFA24, 0xFA27,
                       0xFA28, 0xFA29])):
            base = 0xFB40
        elif (0x3400 <= cp <= 0x4DB5
              or 0x20000 <= cp <= 0x2A6D6
              or 0x2A700 <= cp <= 0x2B734):
            base = 0xFB80
        else:
            base = 0xFBC0

        aaaa = base + (cp >> 15)
        bbbb = (cp & 0x7FFF) | 0x8000
        # FIXME(jfw): Reread standard to make sure the 4th element is
        # right.
        return [('.', [aaaa, 0x20, 0x2, 0x2]),
                ('.', [bbbb, 0x0, 0x0, 0x0])]

    def key(self, string):
        """Return the sort key for *string* as a tuple of integers.

        The string is converted to code points and split greedily into
        table entries; a code point with no entry gets implicit weights.
        The key is the non-zero weights of level 0 of every collation
        element, then a 0 separator, then level 1, and so on for four
        levels.
        """
        collation_elements = []

        lookup_key = [ord(ch) for ch in string]
        while lookup_key:
            value, remainder = self.__table.find_prefix(lookup_key)
            if value is None or len(remainder) >= len(lookup_key):
                # No usable table entry.  Consume exactly one code point
                # ourselves instead of trusting the returned remainder, so
                # that no code point is dropped and the loop always makes
                # progress.
                value = self.__implicit_weight(lookup_key[0])
                remainder = lookup_key[1:]
            collation_elements.extend(value)
            lookup_key = remainder

        sort_key = []

        for level in range(4):
            if level:
                sort_key.append(0)  # level separator
            for element in collation_elements:
                ce_l = element[1][level]
                if ce_l:
                    sort_key.append(ce_l)

        return tuple(sort_key)
121
# Load the shared default collation table once, at import time, so that a
# missing data file is reported as an ImportError.
try:
    # open() instead of the Python-2-only file() builtin; same behavior.
    fileobj = open(os.path.join(DIRNAME, "allkeys.txt"), "rU")
except EnvironmentError:
    raise ImportError("no DUCET information available")
else:
    _DUCET = Trie()
    try:
        load_trie(fileobj, _DUCET)
    finally:
        # Close the handle even if parsing the table fails.
        fileobj.close()
    del fileobj