1 package net.sf.saxon.codenorm; 2 3 import net.sf.saxon.om.FastStringBuffer; 4 5 import java.io.*; 6 import java.util.ArrayList ; 7 import java.util.Iterator ; 8 import java.util.List ; 9 10 29 class UnicodeDataGenerator { 30 static final String copyright = "Copyright © 1998-1999 Unicode, Inc."; 31 32 35 36 private static final boolean DEBUG = false; 37 38 41 private static String dir; 43 44 private static String UNICODE_DATA = "UnicodeData.txt"; 45 private static String COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"; 46 47 private static List canonicalClassKeys = new ArrayList (30000); 48 private static List canonicalClassValues = new ArrayList (30000); 49 50 private static List decompositionKeys = new ArrayList (6000); 51 private static List decompositionValues = new ArrayList (6000); 52 53 private static List exclusionList = new ArrayList (200); 54 private static List compatibilityList = new ArrayList (8000); 55 56 private UnicodeDataGenerator() { 57 } 58 59 62 63 static void build() { 64 try { 65 readExclusionList(); 66 buildDecompositionTables(); 67 } catch (java.io.IOException e) { 68 System.err.println("Can't load data file." + e + ", " + e.getMessage()); 69 } 70 } 71 72 76 79 80 82 private static void readExclusionList() throws java.io.IOException { 83 if (DEBUG) System.out.println("Reading Exclusions"); 84 BufferedReader in = new BufferedReader(new FileReader(dir + '/' + COMPOSITION_EXCLUSIONS), 5*1024); 85 while (true) { 86 87 89 String line = in.readLine(); 90 if (line == null) break; 91 int comment = line.indexOf('#'); if (comment != -1) line = line.substring(0,comment); 93 if (line.length() == 0) continue; 95 97 int z = line.indexOf(' '); 98 if (z < 0) { 99 z = line.length(); 100 } 101 int value = Integer.parseInt(line.substring(0,z),16); 102 exclusionList.add(new Integer (value)); 103 104 } 105 in.close(); 106 } 107 108 111 private static void buildDecompositionTables() 112 throws java.io.IOException { 113 if (DEBUG) System.out.println("Reading Unicode Character Database"); 114 BufferedReader in = new BufferedReader(new FileReader(dir + '/' + UNICODE_DATA), 64*1024); 115 int value; 116 int counter = 0; 117 while (true) { 118 119 121 String line = in.readLine(); 122 if (line == null) break; 123 int comment = line.indexOf('#'); if (comment != -1) line = line.substring(0,comment); 125 if (line.length() == 0) continue; 126 if (DEBUG) { 127 counter++; 128 if ((counter & 0xFF) == 0) System.out.println("At: " + line); 129 } 130 131 134 int start = 0; 135 int end = line.indexOf(';'); try { 137 value = Integer.parseInt(line.substring(start,end),16); 138 } catch (NumberFormatException e) { 139 throw new IllegalStateException ("Bad hex value in line:\n" + line); 140 } 141 if (true && value == '\u00c0') { 142 System.out.println("debug: " + line); 143 } 144 end = line.indexOf(';', end+1); end = line.indexOf(';', end+1); end = line.indexOf(';', start=end+1); 149 151 int cc = Integer.parseInt(line.substring(start,end)); 152 if (cc != (cc & 0xFF)) System.err.println("Bad canonical class at: " + line); 153 canonicalClassKeys.add(new Integer (value)); 154 canonicalClassValues.add(new Integer (cc)); 155 end = line.indexOf(';', end+1); end = line.indexOf(';', start=end+1); 159 163 if (start != end) { 164 String segment = line.substring(start, end); 165 boolean compat = segment.charAt(0) == '<'; 166 if (compat) { 167 compatibilityList.add(new Integer (value)); 168 } 170 String decomp = fromHex(segment); 171 172 174 if (decomp.length() < 1 || decomp.length() > 2 && !compat) { 175 System.err.println("Bad decomp at: " + line); 176 } 177 178 decompositionKeys.add(new Integer (value)); 179 decompositionValues.add(decomp); 180 182 185 } 204 } 205 in.close(); 206 if (DEBUG) System.out.println("Done reading Unicode Character Database"); 207 208 212 } 231 232 235 241 244 245 247 public static String fromHex(String source) { 248 FastStringBuffer result = new FastStringBuffer(5); 249 for (int i = 0; i < source.length(); ++i) { 250 char c = source.charAt(i); 251 switch (c) { 252 case ' ': break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': 254 case '8': case '9': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 255 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 256 int z = source.indexOf(' ',i); 257 if (z < 0) { 258 z = source.length(); 259 } 260 try { 261 result.append((char)Integer.parseInt(source.substring(i, z),16)); 262 } catch (NumberFormatException e) { 263 throw new IllegalArgumentException ("Bad hex value in " + source); 264 } 265 i = z; break; 267 case '<': int j = source.indexOf('>',i); if (j > 0) { 269 i = j; 270 break; 271 } default: 273 throw new IllegalArgumentException ("Bad hex value in " + source); 274 } 275 } 276 return result.toString(); 277 } 278 279 282 public static String hex(char i) { 283 String result = Integer.toString(i, 16).toUpperCase(); 284 return "0000".substring(result.length(),4) + result; 285 } 286 287 290 public static String hex(String s, String sep) { 291 FastStringBuffer result = new FastStringBuffer(20); 292 for (int i = 0; i < s.length(); ++i) { 293 if (i != 0) result.append(sep); 294 result.append(hex(s.charAt(i))); 295 } 296 return result.toString(); 297 } 298 299 302 303 private static void generateJava(PrintStream o) { 304 o.println("package net.sf.saxon.codenorm;"); 305 o.println(""); 306 o.println("//This module was generated by running net.sf.saxon.codenorm.UnicodeDataGenerator"); 307 o.println("//*** DO NOT EDIT! ***"); 308 o.println("//The strange format of this file is carefully chosen to avoid breaking Java compiler limits"); 309 o.println(""); 310 o.println("public class UnicodeData {"); 311 312 o.println("public static final String[] canonicalClassKeys = {"); 314 printArray(o, canonicalClassKeys.iterator()); 315 o.println("};"); 316 o.println("public static final String[] canonicalClassValues = {"); 317 printArray(o, canonicalClassValues.iterator()); 318 o.println("};"); 319 320 o.println("public static final String[] decompositionKeys = {"); 322 printArray(o, decompositionKeys.iterator()); 323 o.println("};"); 324 o.println("public static final String[] decompositionValues = {"); 325 printStringArray(o, decompositionValues.iterator()); 326 o.println("};"); 327 328 o.println("public static final String[] exclusionList = {"); 330 printArray(o, exclusionList.iterator()); 331 o.println("};"); 332 333 o.println("public static final String[] compatibilityList = {"); 335 printArray(o, compatibilityList.iterator()); 336 o.println("};"); 337 338 o.println("}"); 339 340 } 341 342 345 346 private static void printArray(PrintStream o, Iterator iter) { 347 int count = 0; 348 FastStringBuffer buff = new FastStringBuffer(120); 349 if (!iter.hasNext()) return; 350 buff.append('"'); 351 while (true) { 352 if (++count == 20) { 353 count = 0; 354 buff.append("\","); 355 o.println(buff.toString()); 356 buff.setLength(0); 357 buff.append('"'); 358 } 359 int next = ((Integer )iter.next()).intValue(); 360 buff.append(Integer.toString(next, 32)); if (iter.hasNext()) { 362 buff.append(","); 363 } else { 364 buff.append("\""); 365 o.println(buff.toString()); 366 return; 367 } 368 } 369 } 370 371 374 375 private static void printStringArray(PrintStream o, Iterator iter) { 376 int count = 0; 377 FastStringBuffer buff = new FastStringBuffer(120); 378 if (!iter.hasNext()) return; 379 while (true) { 380 if (++count == 20) { 381 count = 0; 382 o.println(buff.toString()); 383 buff.setLength(0); 384 } 385 String next = (String )iter.next(); 386 appendJavaString(next, buff); 387 if (iter.hasNext()) { 388 buff.append(", "); 389 } else { 390 o.println(buff.toString()); 391 return; 392 } 393 } 394 } 395 396 private static void appendJavaString(String value, FastStringBuffer buff) { 397 buff.append('"'); 398 for (int i=0; i<value.length(); i++) { 399 char c = value.charAt(i); 400 if (c == '\\') { 401 buff.append("\\\\"); 402 } else if (c == '"') { 403 buff.append("\\\""); 404 } else if (c > 32 && c < 127) { 405 buff.append(c); 406 } else { 407 buff.append("\\u"); 408 char b0 = "0123456789abcdef".charAt(c & 0xf); 409 char b1 = "0123456789abcdef".charAt((c>>4) & 0xf); 410 char b2 = "0123456789abcdef".charAt((c>>8) & 0xf); 411 char b3 = "0123456789abcdef".charAt((c>>12) & 0xf); 412 buff.append(b3); 413 buff.append(b2); 414 buff.append(b1); 415 buff.append(b0); 416 } 417 } 418 buff.append('"'); 419 } 420 421 430 431 public static void main(String [] args) throws Exception { 432 if (args.length != 2) { 433 System.err.println("Usage: java UnicodeDataGenerator dir UnicodeData.java"); 434 System.err.println("where dir is the directory containing the files UnicodeData.text and" + 435 " CompositionExclusions.txt from the Unicode character database"); 436 } 437 dir = args[0]; 438 build(); 439 PrintStream o = new PrintStream(new FileOutputStream(new File(args[1]))); 440 generateJava(o); 441 } 442 } 443 | Popular Tags |