+++ /dev/null
-import os
-
-import collate.errors
-import collate._abcollator
-import collate._constants
-
# Directory containing this module; the collation table files (``*.txt``)
# opened further down are looked up relative to it.
DIRNAME = os.path.split(__file__)[0]
-
class Trie(object):
    """Prefix tree mapping sequences of hashable parts to values.

    Every node is a two-item list ``[value, children]``: ``value`` is the
    payload stored for the key that ends at the node (``None`` when no key
    ends there) and ``children`` maps the next part of a key to its node.
    """

    def __init__(self):
        # The root stands for the empty key.
        self.root = [None, {}]

    def add(self, key, value):
        """Store *value* under *key*, replacing any previous value.

        key   -- iterable of hashable parts (e.g. a list of code points).
        value -- payload to associate with the key.
        """
        node = self.root
        for part in key:
            node = node[1].setdefault(part, [None, {}])
        node[0] = value

    def find_prefix(self, key):
        """Return ``(value, remainder)`` for the longest stored prefix of *key*.

        key -- sliceable sequence of parts (list, tuple or string).

        ``value`` is the payload of the longest prefix of *key* that has a
        non-None value stored, and ``remainder`` is the slice of *key* that
        follows that prefix.  When no prefix carries a value the result is
        ``(None, key[0:])``, i.e. nothing is consumed.
        """
        node = self.root
        best_value, best_length = node[0], 0
        for depth, part in enumerate(key, 1):
            node = node[1].get(part)
            if node is None:
                break
            # Intermediate nodes that merely lead to longer keys have no
            # value; only remember positions where a real entry ends so a
            # failed longer match falls back to the last complete one
            # instead of discarding the parts walked so far.
            if node[0] is not None:
                best_value, best_length = node[0], depth
        return (best_value, key[best_length:])
-
def load_trie(fileobj, trie):
    """Fill *trie* with the collation entries read from *fileobj*.

    fileobj -- iterable of text lines.  Each entry line has the form
               ``XXXX [XXXX ...] ; [aWWWW.WWWW...][aWWWW...]...`` where the
               hexadecimal tokens before the semicolon are code points and
               every bracketed group is one collation element: a one
               character marker ``a`` followed by dot-separated hexadecimal
               weights.
    trie    -- object with an ``add(key, value)`` method.  It is called once
               per entry with the list of integer code points as key and a
               list of ``(marker, [weights])`` tuples as value.

    Text after ``#`` or ``%`` is a comment.  Blank lines, comment-only
    lines and lines starting with ``@`` are skipped, as are lines that have
    no code points before the semicolon.

    Raises ValueError for a non-hexadecimal token or a ``[`` that has no
    closing ``]``.
    """
    for raw_line in fileobj:
        # partition() leaves the text untouched when there is no comment
        # marker; slicing with the -1 returned by find() would instead cut
        # the last character off a final line lacking a newline.
        line = raw_line.partition("#")[0].partition("%")[0].strip()
        if not line or line.startswith("@"):
            continue

        chars_field, _, elements_field = line.partition(";")
        code_points = [int(token, 16) for token in chars_field.split()]
        if not code_points:
            # An empty key would overwrite the value at the trie root.
            continue

        coll_elements = []
        start = elements_field.find("[")
        while start != -1:
            end = elements_field.find("]", start)
            if end == -1:
                raise ValueError(
                    "unterminated collation element in %r" % (line,))
            marker = elements_field[start + 1]
            weights = [int(weight, 16) for weight
                       in elements_field[start + 2:end].split(".")]
            coll_elements.append((marker, weights))
            start = elements_field.find("[", end + 1)

        trie.add(code_points, coll_elements)
-
class Collator(collate._abcollator.Collator):
    """Collator that builds sort keys from a collation element table.

    The table is a trie mapping code point sequences to lists of
    ``(marker, [weights])`` collation elements, as produced by
    ``load_trie``.
    """

    def __init__(self, locale, encoding=None):
        """Select the collation table for *locale*.

        The ``"C"`` locale shares the module-level default table; any other
        locale reads ``<locale.lower()>.txt`` from this module's directory.

        Raises collate.errors.InvalidLocaleError when that file cannot be
        opened.
        """
        # NOTE(review): collate._locale is not imported in the visible part
        # of this file; presumably the package makes it available -- confirm.
        self.locale, self.encoding = collate._locale.getpair(locale, encoding)
        if self.locale == "C":
            self.__table = _DUCET
            return

        self.__table = Trie()
        # NOTE(review): the file name is built from the raw ``locale``
        # argument, not the normalized ``self.locale`` -- confirm intended.
        filename = os.path.join(DIRNAME, locale.lower() + ".txt")
        try:
            # Plain "r": the "U" flag is rejected by Python 3.11+, and the
            # loader strips surrounding whitespace from every line anyway.
            fileobj = open(filename, "r")
        except EnvironmentError:
            raise collate.errors.InvalidLocaleError(self.locale)
        try:
            load_trie(fileobj, self.__table)
        finally:
            # Close the table file even when parsing fails.
            fileobj.close()

    def __implicit_weight(self, cp):
        """Return the two implicit collation elements for code point *cp*.

        Used for code points that have no entry in the table (UCA 7.1.3).
        The base weight depends on the ideograph range *cp* falls in.
        """
        if (0x4E00 <= cp <= 0x9FCB
            or cp in (0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14,
                      0xFA1F, 0xFA21, 0xFA23, 0xFA24, 0xFA27,
                      0xFA28, 0xFA29)):
            base = 0xFB40
        elif (0x3400 <= cp <= 0x4DB5
              or 0x20000 <= cp <= 0x2A6D6
              or 0x2A700 <= cp <= 0x2B734):
            base = 0xFB80
        else:
            base = 0xFBC0

        # The code point is split over two primary weights: the high bits
        # are added to the base, the low 15 bits get the top bit set.
        aaaa = base + (cp >> 15)
        bbbb = (cp & 0x7FFF) | 0x8000
        # FIXME(jfw): Reread standard to make sure the 4th element is
        # right.
        return [('.', [aaaa, 0x20, 0x2, 0x2]),
                ('.', [bbbb, 0x0, 0x0, 0x0])]

    def key(self, string):
        """Return the sort key of *string* as a tuple of integers.

        The key lists the non-zero weights of all collation elements level
        by level (four levels), with a single 0 separating the levels.
        """
        table = self.__table
        collation_elements = []

        pending = [ord(ch) for ch in string]
        while pending:
            value, rest = table.find_prefix(pending)
            # Fall back to an implicit weight when nothing matched.  The
            # fallback always takes the first *pending* code point, so no
            # code point is skipped even if the lookup walked further into
            # the input; it is also used when the lookup consumed nothing,
            # which would otherwise loop forever.
            if value is None or len(rest) >= len(pending):
                value = self.__implicit_weight(pending[0])
                rest = pending[1:]
            collation_elements.extend(value)
            pending = rest

        sort_key = []
        for level in range(4):
            if level:
                sort_key.append(0)  # level separator
            # Zero weights are ignorable at this level and left out.
            sort_key.extend(element[1][level]
                            for element in collation_elements
                            if element[1][level])

        return tuple(sort_key)
-
# Load the default collation table from ``allkeys.txt`` in this module's
# directory at import time; the module cannot be imported without it.
try:
    # open() instead of the Python-2-only file() builtin, and plain "r"
    # because the "U" flag is rejected by Python 3.11+ (the loader strips
    # surrounding whitespace from every line anyway).
    fileobj = open(os.path.join(DIRNAME, "allkeys.txt"), "r")
except EnvironmentError:
    raise ImportError("no DUCET information available")
else:
    try:
        _DUCET = Trie()
        load_trie(fileobj, _DUCET)
    finally:
        # Close the file and drop the temporary name even if parsing fails.
        fileobj.close()
        del fileobj