Raw codepoint test backend. Tweaks to constructor arguments. Try to handle the case...
[python-collate.git] / collate / uca / __init__.py
1 import os
2
3 import collate.errors
4 import collate._abcollator
5 import collate._constants
6
# Display name of this collation backend; the numeric VERSION constant from
# collate._constants is rendered with %g (so 1.0 shows as "1").
NAME = "Python UCA %g" % collate._constants.VERSION
8
class Trie(object):
    """Prefix tree mapping sequences (e.g. lists of code points) to values.

    Each node is a two-item list ``[value, children]`` where ``value`` is
    the object stored for the key ending at that node (``None`` when no
    key ends there) and ``children`` maps the next key element to the
    child node.
    """

    def __init__(self):
        """Create an empty trie consisting of a value-less root node."""
        self.root = [None, {}]

    def add(self, key, value):
        """Store ``value`` under ``key``, replacing any previous value.

        ``key`` is any iterable of hashable parts; intermediate nodes are
        created on demand.
        """
        node = self.root
        for part in key:
            node = node[1].setdefault(part, [None, {}])
        node[0] = value

    def find_prefix(self, key):
        """Find the longest prefix of ``key`` that has a stored value.

        Returns a ``(value, remainder)`` tuple where ``remainder`` is the
        part of ``key`` following the matched prefix (same sequence type
        as ``key``).  When no prefix of ``key`` carries a value the result
        is ``(None, key[0:])``, i.e. nothing is consumed.

        ``key`` must be a sliceable sequence.
        """
        node = self.root
        # Remember the deepest node *with a value*, not merely the deepest
        # node reached: walking through value-less intermediate nodes must
        # not discard a shorter match or swallow unmatched elements.
        best_value = node[0]
        best_length = 0
        for depth, part in enumerate(key, 1):
            node = node[1].get(part)
            if node is None:
                break
            if node[0] is not None:
                best_value = node[0]
                best_length = depth
        return (best_value, key[best_length:])
29
30
class Collator(collate._abcollator.Collator):
    """Pure-Python collator driven by a UCA collation element table.

    The table is read from a text file located next to this module and
    stored in a ``Trie`` keyed by lists of integer code points.  Each
    stored value is a list of ``(alt, weights)`` tuples, where ``alt`` is
    the character following ``[`` in the table entry and ``weights`` is
    the list of hexadecimal weight strings.
    """

    # Code points outside 4E00..9FCB that share the 0xFB40 implicit-weight
    # base with that range (see __implicit_weight).
    __COMPATIBILITY_IDEOGRAPHS = frozenset([
        0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F,
        0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29,
    ])

    def __init__(self, locale_code, encoding=None):
        """Load the collation table for ``locale_code``.

        The table is looked up in this module's directory, trying in
        order ``<locale>.txt`` (lower-cased, with any ``.suffix``
        removed), ``<language>.txt`` (the part before ``_``) and finally
        ``allkeys.txt``.  The first file that can be opened is used.

        ``encoding`` is accepted but not used by this backend.

        Raises ``collate.errors.InvalidLocaleError`` when none of the
        candidate files can be opened.
        """
        self.__table = Trie()
        self.locale = locale_code
        dirname = os.path.dirname(__file__)
        locale_code = locale_code.split(".")[0].lower()
        short_code = locale_code.split("_")[0]
        filenames = [os.path.join(dirname, locale_code + ".txt"),
                     os.path.join(dirname, short_code + ".txt"),
                     os.path.join(dirname, "allkeys.txt")]
        for filename in filenames:
            try:
                # Plain "r": the "U" flag is rejected with ValueError by
                # Python 3.11+, and the parser strips line endings itself.
                fileobj = open(filename, "r")
            except EnvironmentError:
                continue
            try:
                self.__load(fileobj)
            finally:
                # Always release the handle, even if parsing fails.
                fileobj.close()
            break
        else:
            raise collate.errors.InvalidLocaleError(locale_code)

    def __load(self, fileobj):
        """Parse a collation element table and fill the trie.

        Lines look like ``0041 ; [.15EF.0020.0008.0041] # comment``:
        hexadecimal code points, a semicolon, then one or more bracketed
        collation elements.  ``#`` and ``%`` start a comment; blank lines,
        ``@`` directives and lines without a semicolon or without code
        points are ignored.

        Raises ``ValueError`` if a code point is not valid hexadecimal.
        """
        for line in fileobj:
            # Cut comments with split() rather than slicing on find():
            # find() returns -1 when the marker is absent, which would
            # chop the last character of a final line with no newline.
            for marker in ("#", "%"):
                line = line.split(marker, 1)[0]
            line = line.strip()
            if not line or line.startswith("@"):
                continue

            char_part, separator, element_part = line.partition(";")
            char_list = char_part.split()
            if not separator or not char_list:
                # Never store anything under the empty key: a value on
                # the trie root would match every lookup without
                # consuming any input.
                continue

            coll_elements = []
            begin = element_part.find("[")
            while begin != -1:
                end = element_part.find("]", begin)
                if end == -1:
                    break  # unterminated element; ignore the rest
                body = element_part[begin + 1:end]
                if body:
                    # body[0] is the alternate marker, the rest are the
                    # dot-separated weights.
                    coll_elements.append((body[0], body[1:].split(".")))
                begin = element_part.find("[", end + 1)

            integer_points = [int(ch, 16) for ch in char_list]
            self.__table.add(integer_points, coll_elements)

    def __implicit_weight(self, cp):
        """Return the two implicit collation elements for code point ``cp``.

        Used for code points that have no entry in the loaded table.
        The base value depends on the range ``cp`` falls in; the code
        point is then split into its bits above bit 14 (added to the
        base) and its low 15 bits (with 0x8000 set).
        """
        # UCA 7.1.3.
        if (0x4E00 <= cp <= 0x9FCB
                or cp in self.__COMPATIBILITY_IDEOGRAPHS):
            base = 0xFB40
        elif (0x3400 <= cp <= 0x4DB5
              or 0x20000 <= cp <= 0x2A6D6
              or 0x2A700 <= cp <= 0x2B734):
            base = 0xFB80
        else:
            base = 0xFBC0

        aaaa = base + (cp >> 15)
        bbbb = (cp & 0x7FFF) | 0x8000
        # FIXME(jfw): Reread standard to make sure the 4th element is
        # right.
        return [('.', ["%04X" % aaaa, "0020", "0002", "0002"]),
                ('.', ["%04X" % bbbb, "0000", "0000", "0000"])]

    def key(self, string):
        """Return the sort key for ``string`` as a tuple of integers.

        The string is converted to code points and split greedily into
        table entries; a code point with no entry gets implicit weights.
        The key lists the non-zero weights of level 1 for all collation
        elements, then a 0 separator, then level 2, and so on up to
        level 4.  Elements with fewer than four weights contribute
        nothing to the missing levels.
        """
        collation_elements = []

        lookup_key = [ord(ch) for ch in string]
        while lookup_key:
            value, remainder = self.__table.find_prefix(lookup_key)
            if value is None or len(remainder) >= len(lookup_key):
                # No usable table entry (or no progress was made): weight
                # the first pending code point implicitly and advance by
                # exactly one, so nothing is skipped and the loop always
                # terminates.
                value = self.__implicit_weight(lookup_key[0])
                remainder = lookup_key[1:]
            collation_elements.extend(value)
            lookup_key = remainder

        # Convert the hexadecimal weights once instead of once per level.
        numeric = [[int(weight, 16) for weight in weights]
                   for _alt, weights in collation_elements]

        sort_key = []
        for level in range(4):
            if level:
                sort_key.append(0)  # level separator
            for weights in numeric:
                if level < len(weights) and weights[level]:
                    sort_key.append(weights[level])

        return tuple(sort_key)