"""Collator that builds sort keys from DUCET-style collation element tables.

The default table is read from ``allkeys.txt`` next to this module at import
time; other locales read ``<locale>.txt`` from the same directory.
"""

import os

import collate.errors
import collate._abcollator
import collate._constants

DIRNAME = os.path.dirname(__file__)

# Compatibility ideographs that get the same implicit-weight base as the
# main 0x4E00..0x9FCB block (see Collator.__implicit_weight).
_UNIFIED_COMPAT_IDEOGRAPHS = frozenset([
    0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F,
    0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29,
])


class Trie(object):
    """Prefix tree mapping sequences of hashable parts to values.

    Every node is a two-item list ``[value, children]`` where ``value`` is
    ``None`` until a key ending at that node is added and ``children`` maps
    the next part of a key to its child node.
    """

    def __init__(self):
        self.root = [None, {}]

    def add(self, key, value):
        """Store *value* under the sequence *key*, creating nodes as needed."""
        node = self.root
        for part in key:
            node = node[1].setdefault(part, [None, {}])
        node[0] = value

    def find_prefix(self, key):
        """Find the longest prefix of *key* that has a value stored.

        Returns ``(value, remainder)`` where ``remainder`` is the part of
        *key* after the matched prefix. When no prefix has a value the result
        is ``(None, key)`` with *key* returned unsliced (unless a value was
        stored under the empty key, in which case that value is returned).
        """
        node = self.root
        value, remainder = node[0], key
        for depth, part in enumerate(key, 1):
            node = node[1].get(part)
            if node is None:
                break
            # Only nodes that actually carry a value count as a match;
            # interior nodes exist merely because a longer key passes
            # through them, and must not swallow parts of the key.
            if node[0] is not None:
                value, remainder = node[0], key[depth:]
        return (value, remainder)


def _parse_collation_elements(text):
    """Return an ``(alt, weights)`` pair for every ``[...]`` group in *text*.

    ``alt`` is the first character inside the brackets and ``weights`` is the
    list of the remaining ``.``-separated fields parsed as hexadecimal.
    Raises ``ValueError`` for an unterminated or empty group or a field that
    is not hexadecimal.
    """
    elements = []
    begin = text.find("[")
    while begin != -1:
        end = text.find("]", begin)
        if end == -1 or end == begin + 1:
            raise ValueError("malformed collation element in %r" % (text,))
        body = text[begin + 1:end]
        weights = [int(field, 16) for field in body[1:].split(".")]
        elements.append((body[0], weights))
        begin = text.find("[", begin + 1)
    return elements


def load_trie(fileobj, trie):
    """Populate *trie* from the collation table lines in *fileobj*.

    Each entry line has the shape ``<hex code points> ; [<alt><w>.<w>...]...``,
    for example ``0041 ; [.15EF.0020.0008.0041]``. Text after ``#`` or ``%``
    is a comment; blank lines and lines starting with ``@`` are ignored.
    The key added to *trie* is the list of integer code points and the value
    is the list of ``(alt, weights)`` pairs.
    """
    for raw_line in fileobj:
        # Cut at the first comment marker. Splitting (rather than slicing at
        # str.find's result) stays correct when there is no marker and when
        # the last line of the file has no trailing newline.
        line = raw_line.split("#", 1)[0].split("%", 1)[0].strip()
        if not line or line.startswith("@"):
            continue

        semicolon = line.find(";")
        if semicolon == -1:
            head, tail = line, ""
        else:
            head, tail = line[:semicolon], line[semicolon:]

        code_points = [int(field, 16) for field in head.split()]
        if not code_points:
            # An empty key would put a value on the trie root, which makes
            # every lookup "match" without consuming anything.
            continue
        trie.add(code_points, _parse_collation_elements(tail))


class Collator(collate._abcollator.Collator):
    """Collator producing sort keys from a collation element table."""

    def __init__(self, locale, encoding=None):
        """Select the weight table for *locale*.

        The ``"C"`` locale uses the table loaded from ``allkeys.txt``; any
        other locale loads ``<locale>.txt`` from this module's directory.

        Raises ``collate.errors.InvalidLocaleError`` when that file cannot be
        opened.
        """
        # NOTE(review): collate._locale is not imported by this module; this
        # relies on it having been imported elsewhere - confirm.
        self.locale, self.encoding = collate._locale.getpair(locale, encoding)
        if self.locale == "C":
            self.__table = _DUCET
        else:
            self.__table = Trie()
            # NOTE(review): the file name is built from the raw ``locale``
            # argument, not the normalised ``self.locale`` - confirm intent.
            filename = os.path.join(DIRNAME, locale.lower() + ".txt")
            try:
                fileobj = open(filename, "rU")
            except EnvironmentError:
                raise collate.errors.InvalidLocaleError(self.locale)
            # Parse errors propagate unchanged, but the handle is always
            # released.
            try:
                load_trie(fileobj, self.__table)
            finally:
                fileobj.close()

    def __implicit_weight(self, cp):
        """Return the two implicit collation elements for code point *cp*."""
        # UCA 7.1.3.
        if 0x4E00 <= cp <= 0x9FCB or cp in _UNIFIED_COMPAT_IDEOGRAPHS:
            base = 0xFB40
        elif (0x3400 <= cp <= 0x4DB5
              or 0x20000 <= cp <= 0x2A6D6
              or 0x2A700 <= cp <= 0x2B734):
            base = 0xFB80
        else:
            base = 0xFBC0

        aaaa = base + (cp >> 15)
        bbbb = (cp & 0x7FFF) | 0x8000
        # FIXME(jfw): Reread standard to make sure the 4th element is
        # right.
        return [('.', [aaaa, 0x20, 0x2, 0x2]),
                ('.', [bbbb, 0x0, 0x0, 0x0])]

    def key(self, string):
        """Return the sort key for *string* as a tuple of integers.

        The key lists the non-zero weights of each of the four levels in
        turn, with a ``0`` separating consecutive levels.
        """
        collation_elements = []

        lookup_key = [ord(ch) for ch in string]
        while lookup_key:
            value, rest = self.__table.find_prefix(lookup_key)
            if value is None or len(rest) == len(lookup_key):
                # Nothing in the table matched at this position: give the
                # first code point an implicit weight and move past it, so
                # the loop always makes progress.
                value = self.__implicit_weight(lookup_key[0])
                rest = lookup_key[1:]
            collation_elements.extend(value)
            lookup_key = rest

        sort_key = []
        for level in range(4):
            if level:
                sort_key.append(0)  # level separator
            for _alt, weights in collation_elements:
                # Elements with fewer than four weights are treated as
                # having zero (ignored) weights at the missing levels.
                weight = weights[level] if level < len(weights) else 0
                if weight:
                    sort_key.append(weight)

        return tuple(sort_key)


# Load the default table once at import time; without it the module is
# unusable, so a missing file is reported as an ImportError.
try:
    fileobj = open(os.path.join(DIRNAME, "allkeys.txt"), "rU")
except EnvironmentError:
    raise ImportError("no DUCET information available")
else:
    _DUCET = Trie()
    try:
        load_trie(fileobj, _DUCET)
    finally:
        fileobj.close()
    del fileobj