From e168dd08689113c094141be7e58bacddf5034fa8 Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Sun, 14 Feb 2010 21:39:19 -0800 Subject: [PATCH] uca: Share DUCET data; ensure it exists at import time. --- collate/uca/__init__.py | 108 +++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/collate/uca/__init__.py b/collate/uca/__init__.py index d40319a..21a819a 100644 --- a/collate/uca/__init__.py +++ b/collate/uca/__init__.py @@ -4,8 +4,9 @@ import collate.errors import collate._abcollator import collate._constants -class Trie(object): +DIRNAME = os.path.dirname(__file__) +class Trie(object): def __init__(self): self.root = [None, {}] @@ -25,61 +26,54 @@ class Trie(object): remainder = remainder[1:] return (curr_node[0], remainder) +def load_trie(fileobj, trie): + for line in fileobj: + if line.startswith("#") or line.startswith("%"): + continue + if line.strip() == "": + continue + line = line[:line.find("#")] + "\n" + line = line[:line.find("%")] + "\n" + line = line.strip() -class Collator(collate._abcollator.Collator): + if line.startswith("@"): + pass + else: + semicolon = line.find(";") + charList = line[:semicolon].strip().split() + x = line[semicolon:] + collElements = [] + while True: + begin = x.find("[") + if begin == -1: + break + end = x[begin:].find("]") + collElement = x[begin:begin+end+1] + x = x[begin + 1:] - def __init__(self, locale_code, encoding=None): + alt = collElement[1] + chars = collElement[2:-1].split(".") + chars = [int(_, 16) for _ in chars] - self.__table = Trie() - self.locale = locale_code - dirname = os.path.dirname(__file__) - locale_code = locale_code.split(".")[0].lower() - short_code = locale_code.split("_")[0] - filenames = [os.path.join(dirname, locale_code + ".txt"), - os.path.join(dirname, short_code + ".txt"), - os.path.join(dirname, "allkeys.txt")] - for filename in filenames: + collElements.append((alt, chars)) + integer_points = [int(ch, 16) for ch in charList] + trie.add(integer_points, collElements) + +class Collator(collate._abcollator.Collator): + + def __init__(self, locale, encoding=None): + self.locale, self.encoding = collate._locale.getpair(locale, encoding) + if self.locale == "C": + self.__table = _DUCET + else: + self.__table = Trie() + filename = os.path.join(DIRNAME, locale.lower() + ".txt") try: fileobj = open(filename, "rU") except EnvironmentError: - pass + raise collate.errors.InvalidLocaleError(self.locale) else: - self.__load(fileobj) - break - else: - raise collate.errors.InvalidLocaleError(locale_code) - - def __load(self, fileobj): - for line in fileobj: - if line.startswith("#") or line.startswith("%"): - continue - if line.strip() == "": - continue - line = line[:line.find("#")] + "\n" - line = line[:line.find("%")] + "\n" - line = line.strip() - - if line.startswith("@"): - pass - else: - semicolon = line.find(";") - charList = line[:semicolon].strip().split() - x = line[semicolon:] - collElements = [] - while True: - begin = x.find("[") - if begin == -1: - break - end = x[begin:].find("]") - collElement = x[begin:begin+end+1] - x = x[begin + 1:] - - alt = collElement[1] - chars = collElement[2:-1].split(".") - - collElements.append((alt, chars)) - integer_points = [int(ch, 16) for ch in charList] - self.__table.add(integer_points, collElements) + load_trie(fileobj, self.__table) def __implicit_weight(self, cp): # UCA 7.1.3. @@ -99,8 +93,8 @@ class Collator(collate._abcollator.Collator): bbbb = (cp & 0x7FFF) | 0x8000 # FIXME(jfw): Reread standard to make sure the 4th element is # right. - return [('.', ["%04X" % aaaa, "0020", "0002", "0002"]), - ('.', ["%04X" % bbbb, "0000", "0000", "0000"])] + return [('.', [aaaa, 0x20, 0x2, 0x2]), + ('.', [bbbb, 0x0, 0x0, 0x0])] def key(self, string): @@ -119,8 +113,18 @@ class Collator(collate._abcollator.Collator): if level: sort_key.append(0) # level separator for element in collation_elements: - ce_l = int(element[1][level], 16) + ce_l = element[1][level] if ce_l: sort_key.append(ce_l) return tuple(sort_key) + +try: + fileobj = file(os.path.join(DIRNAME, "allkeys.txt"), "rU") +except EnvironmentError: + raise ImportError("no DUCET information available") +else: + _DUCET = Trie() + load_trie(fileobj, _DUCET) + fileobj.close() + del(fileobj) -- 2.30.2