uca: Share DUCET data; ensure it exists at import time.
author Joe Wreschnig <joe.wreschnig@gmail.com>
Mon, 15 Feb 2010 05:39:19 +0000 (21:39 -0800)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Mon, 15 Feb 2010 05:39:19 +0000 (21:39 -0800)
collate/uca/__init__.py

index d40319a..21a819a 100644 (file)
@@ -4,8 +4,9 @@ import collate.errors
 import collate._abcollator
 import collate._constants
 
-class Trie(object):
+DIRNAME = os.path.dirname(__file__)
 
+class Trie(object):
     def __init__(self):
         self.root = [None, {}]
 
@@ -25,61 +26,54 @@ class Trie(object):
             remainder = remainder[1:]
         return (curr_node[0], remainder)
 
+def load_trie(fileobj, trie):
+    for line in fileobj:
+        if line.startswith("#") or line.startswith("%"):
+            continue
+        if line.strip() == "":
+            continue
+        line = line[:line.find("#")] + "\n"
+        line = line[:line.find("%")] + "\n"
+        line = line.strip()
 
-class Collator(collate._abcollator.Collator):
+        if line.startswith("@"):
+            pass
+        else:
+            semicolon = line.find(";")
+            charList = line[:semicolon].strip().split()
+            x = line[semicolon:]
+            collElements = []
+            while True:
+                begin = x.find("[")
+                if begin == -1:
+                    break                
+                end = x[begin:].find("]")
+                collElement = x[begin:begin+end+1]
+                x = x[begin + 1:]
 
-    def __init__(self, locale_code, encoding=None):
+                alt = collElement[1]
+                chars = collElement[2:-1].split(".")
+                chars = [int(_, 16) for _ in chars]
 
-        self.__table = Trie()
-        self.locale = locale_code
-        dirname = os.path.dirname(__file__)
-        locale_code = locale_code.split(".")[0].lower()
-        short_code = locale_code.split("_")[0]
-        filenames = [os.path.join(dirname, locale_code + ".txt"),
-                     os.path.join(dirname, short_code + ".txt"),
-                     os.path.join(dirname, "allkeys.txt")]
-        for filename in filenames:
+                collElements.append((alt, chars))
+            integer_points = [int(ch, 16) for ch in charList]
+            trie.add(integer_points, collElements)
+
+class Collator(collate._abcollator.Collator):
+
+    def __init__(self, locale, encoding=None):
+        self.locale, self.encoding = collate._locale.getpair(locale, encoding)
+        if self.locale == "C":
+            self.__table = _DUCET
+        else:
+            self.__table = Trie()
+            filename = os.path.join(DIRNAME, locale.lower() + ".txt")
             try:
                 fileobj = open(filename, "rU")
             except EnvironmentError:
-                pass
+                raise collate.errors.InvalidLocaleError(self.locale)
             else:
-                self.__load(fileobj)
-                break
-        else:
-            raise collate.errors.InvalidLocaleError(locale_code)
-
-    def __load(self, fileobj):
-        for line in fileobj:
-            if line.startswith("#") or line.startswith("%"):
-                continue
-            if line.strip() == "":
-                continue
-            line = line[:line.find("#")] + "\n"
-            line = line[:line.find("%")] + "\n"
-            line = line.strip()
-
-            if line.startswith("@"):
-                pass
-            else:
-                semicolon = line.find(";")
-                charList = line[:semicolon].strip().split()
-                x = line[semicolon:]
-                collElements = []
-                while True:
-                    begin = x.find("[")
-                    if begin == -1:
-                        break                
-                    end = x[begin:].find("]")
-                    collElement = x[begin:begin+end+1]
-                    x = x[begin + 1:]
-    
-                    alt = collElement[1]
-                    chars = collElement[2:-1].split(".")
-                    
-                    collElements.append((alt, chars))
-                integer_points = [int(ch, 16) for ch in charList]
-                self.__table.add(integer_points, collElements)
+                load_trie(fileobj, self.__table)
 
     def __implicit_weight(self, cp):
         # UCA 7.1.3.
@@ -99,8 +93,8 @@ class Collator(collate._abcollator.Collator):
         bbbb = (cp & 0x7FFF) | 0x8000
         # FIXME(jfw): Reread standard to make sure the 4th element is
         # right.
-        return [('.', ["%04X" % aaaa, "0020", "0002", "0002"]),
-                ('.', ["%04X" % bbbb, "0000", "0000", "0000"])]
+        return [('.', [aaaa, 0x20, 0x2, 0x2]),
+                ('.', [bbbb, 0x0, 0x0, 0x0])]
 
     def key(self, string):
         
@@ -119,8 +113,18 @@ class Collator(collate._abcollator.Collator):
             if level:
                 sort_key.append(0) # level separator
             for element in collation_elements:
-                ce_l = int(element[1][level], 16)
+                ce_l = element[1][level]
                 if ce_l:
                     sort_key.append(ce_l)
         
         return tuple(sort_key)
+
+try:
+    fileobj = file(os.path.join(DIRNAME, "allkeys.txt"), "rU")
+except EnvironmentError:
+    raise ImportError("no DUCET information available")
+else:
+    _DUCET = Trie()
+    load_trie(fileobj, _DUCET)
+    fileobj.close()
+    del(fileobj)