package io;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;

// NOTE(review): this is a JDK-internal copy of the Xerces reader. On JDK 9+ the
// package is presumably not exported from java.xml, so compiling/running may need
// --add-exports java.xml/com.sun.org.apache.xerces.internal.impl.io=ALL-UNNAMED
// (confirm against the target JDK).
import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;

/**
 * Conformance and timing test for UTF-8 decoders.
 * <p>
 * A {@link UTF8Producer} emits the UTF-8 encoding of every code point from
 * 0x000000 up to 0x10FFFF (skipping the surrogate block 0xD800-0xDFFF). The
 * stream is decoded once with the JDK's {@link InputStreamReader} and once
 * with {@link UTF8Reader}, each both one character at a time and in blocks,
 * and the decoded UTF-16 output is compared with the expected sequence.
 * Progress and results are written to {@code System.err}.
 */
public class UTF8 {

    /** Number of chars requested per bulk read in the character-array tests. */
    private static final int BLOCK_READ_SIZE = 2048;

    //
    // MAIN
    //

    /**
     * Runs all four decoder tests and prints PASS (with elapsed time) or FAIL
     * (with the mismatch message) for each one.
     *
     * @param argv ignored
     * @throws Exception if a test fails with something other than an
     *                   {@link IOException}
     */
    public static void main(String[] argv) throws Exception {

        System.err.println("#");
        System.err.println("# Testing Java UTF-8 decoder");
        System.err.println("#");
        runTest(false, false);
        runTest(false, true);

        System.err.println("#");
        System.err.println("# Testing custom UTF-8 decoder");
        System.err.println("#");
        runTest(true, false);
        runTest(true, true);

    } // main(String[])

    //
    // Public static methods
    //

    /**
     * Verifies the decoder output by calling {@link Reader#read()} once per
     * expected char.
     *
     * @param reader decoder under test, positioned at the start of a
     *               {@link UTF8Producer} stream
     * @return elapsed wall-clock time in milliseconds
     * @throws Exception an {@link IOException} describing the first mismatch,
     *                   a premature EOF, or an extra trailing character
     */
    public static long testCharByChar(Reader reader) throws Exception {

        long before = System.currentTimeMillis();
        System.err.println("# Testing character by character");

        verifySequence(new CharFeed(reader, null));

        long after = System.currentTimeMillis();
        return after - before;

    } // testCharByChar(Reader):long

    /**
     * Verifies the decoder output by reading it in blocks of up to
     * {@code size} chars via {@link Reader#read(char[], int, int)}.
     *
     * @param reader decoder under test, positioned at the start of a
     *               {@link UTF8Producer} stream
     * @param size   capacity of the block buffer; must be positive
     * @return elapsed wall-clock time in milliseconds
     * @throws IllegalArgumentException if {@code size} is not positive
     * @throws Exception an {@link IOException} describing the first mismatch,
     *                   a premature EOF, or an extra trailing character
     */
    public static long testCharArray(Reader reader, int size) throws Exception {

        if (size <= 0) {
            throw new IllegalArgumentException("size must be positive: " + size);
        }

        long before = System.currentTimeMillis();
        System.err.println("# Testing character array of size " + size);

        verifySequence(new CharFeed(reader, new char[size]));

        long after = System.currentTimeMillis();
        return after - before;

    } // testCharArray(Reader,int):long

    //
    // Private static methods
    //

    /**
     * Runs one test and reports its outcome. The reader is closed even when
     * the test fails.
     *
     * @param customDecoder {@code true} to test {@link UTF8Reader},
     *                      {@code false} to test {@link InputStreamReader}
     * @param blockRead     {@code true} for the character-array test,
     *                      {@code false} for the character-by-character test
     */
    private static void runTest(boolean customDecoder, boolean blockRead)
        throws Exception {
        try {
            Reader reader = newReader(customDecoder);
            try {
                long time = blockRead
                          ? testCharArray(reader, BLOCK_READ_SIZE)
                          : testCharByChar(reader);
                System.err.println("PASS (" + time + " ms)");
            }
            finally {
                reader.close();
            }
        }
        catch (IOException e) {
            System.err.println("FAIL: " + e.getMessage());
        }
    } // runTest(boolean,boolean)

    /** Creates the decoder under test on top of a fresh producer stream. */
    private static Reader newReader(boolean customDecoder) throws IOException {
        InputStream stream = new UTF8Producer();
        if (customDecoder) {
            return new UTF8Reader(stream);
        }
        return new InputStreamReader(stream, "UTF8");
    } // newReader(boolean):Reader

    /**
     * Checks that the feed yields exactly the UTF-16 form of the producer's
     * code point sequence, followed by EOF.
     */
    private static void verifySequence(CharFeed feed) throws IOException {

        System.err.println("testing 0x000000 -> 0x00007F");
        expectRange(feed, 0x0000, 0x0080);
        System.err.println("testing 0x000080 -> 0x0007FF");
        expectRange(feed, 0x0080, 0x0800);
        System.err.println("testing 0x000800 -> 0x00D7FF");
        expectRange(feed, 0x0800, 0xD800);
        System.err.println("testing 0x00E000 -> 0x00FFFF");
        expectRange(feed, 0xE000, 0x10000);

        System.err.println("testing 0x010000 -> 0x110000");
        for (int cp = 0x10000; cp < 0x110000; cp++) {
            // Supplementary code points decode to a surrogate pair: the high
            // surrogate carries the top 10 bits of (cp - 0x10000), the low
            // surrogate the bottom 10 bits of cp.
            int hs = 0xD800 | ((cp - 0x10000) >> 10);
            int ls = 0xDC00 | (cp & 0x03FF);
            int c = feed.next();
            if (c != hs) {
                expectedChar("high surrogate", hs, c);
            }
            c = feed.next();
            if (c != ls) {
                expectedChar("low surrogate", ls, c);
            }
        }

        System.err.println("checking EOF");
        int c = feed.next();
        if (c != -1) {
            extraChar(c);
        }

    } // verifySequence(CharFeed)

    /**
     * Expects the feed to yield each value in {@code [from, to)} as a single
     * char, in ascending order.
     */
    private static void expectRange(CharFeed feed, int from, int to)
        throws IOException {
        for (int expected = from; expected < to; expected++) {
            int c = feed.next();
            if (c != expected) {
                expectedChar(null, expected, c);
            }
        }
    } // expectRange(CharFeed,int,int)

    /**
     * Always throws an {@link IOException} describing a mismatch.
     *
     * @param prefix optional description of the expected char, may be null
     * @param ec     expected char value
     * @param fc     char value found, or -1 if EOF was found instead
     */
    private static void expectedChar(String prefix, int ec, int fc)
        throws IOException {
        String what = prefix != null ? prefix + " " : "";
        // Only real values get the hex prefix; EOF is reported as plain "EOF".
        String found = fc != -1 ? "0x" + Integer.toHexString(fc) : "EOF";
        throw new IOException("expected " + what + "0x" + Integer.toHexString(ec)
                              + " but found " + found);
    } // expectedChar(String,int,int)

    /** Always throws an {@link IOException} reporting a char found after the expected end. */
    private static void extraChar(int c) throws IOException {
        throw new IOException("found extra character 0x" + Integer.toHexString(c));
    } // extraChar(int)

    //
    // Classes
    //

    /**
     * Hands out the chars of a reader one at a time, fetching them either
     * with single-char reads or through a block buffer.
     */
    private static final class CharFeed {

        /** Decoder under test. */
        private final Reader fReader;

        /** Block buffer, or null to read one char at a time. */
        private final char[] fBuffer;

        /** Number of valid chars in the buffer; -1 once EOF has been seen. */
        private int fCount;

        /** Index of the next unread char in the buffer. */
        private int fPosition;

        CharFeed(Reader reader, char[] buffer) {
            fReader = reader;
            fBuffer = buffer;
        }

        /** Returns the next char as an int, or -1 at end of stream. */
        int next() throws IOException {
            if (fBuffer == null) {
                return fReader.read();
            }
            if (fPosition >= fCount) {
                fPosition = 0;
                // A zero-length read carries no data; ask again rather than
                // handing out stale buffer contents.
                do {
                    fCount = fReader.read(fBuffer, 0, fBuffer.length);
                } while (fCount == 0);
                if (fCount < 0) {
                    return -1;
                }
            }
            return fBuffer[fPosition++];
        }

    } // class CharFeed

    /**
     * Input stream that produces, one byte per {@link #read()} call, the
     * UTF-8 encoding of every code point from 0x000000 through 0x10FFFF in
     * ascending order, skipping 0xD800-0xDFFF, and then reports EOF.
     */
    public static class UTF8Producer
        extends InputStream {

        //
        // Data
        //

        /** Code point currently being encoded. */
        private int fCodePoint;

        /** Index of the next byte to emit within the current sequence. */
        private int fByte;

        //
        // InputStream methods
        //

        /**
         * Returns the next byte of the encoded sequence.
         *
         * @return the byte as a value in 0-255, or -1 once every code point
         *         has been emitted
         */
        public int read() throws IOException {

            int cp = fCodePoint;
            if (cp >= 0x110000) {
                return -1;
            }

            int length = sequenceLength(cp);
            int b;
            if (fByte == 0) {
                b = leadByte(cp, length);
            }
            else {
                // Continuation byte: 10xxxxxx holding the next lower 6 bits.
                int shift = 6 * (length - 1 - fByte);
                b = 0x0080 | ((cp >> shift) & 0x003F);
            }

            fByte++;
            if (fByte == length) {
                // Sequence complete; move on to the next code point.
                fByte = 0;
                fCodePoint++;
                if (fCodePoint == 0xD800) {
                    // Skip the surrogate block.
                    fCodePoint = 0xE000;
                }
            }
            return b;

        } // read():int

        //
        // Private static methods
        //

        /** Returns the number of bytes in the UTF-8 encoding of {@code cp}. */
        private static int sequenceLength(int cp) {
            if (cp < 0x0080) {
                return 1;
            }
            if (cp < 0x0800) {
                return 2;
            }
            if (cp < 0x10000) {
                return 3;
            }
            return 4;
        } // sequenceLength(int):int

        /** Returns the first byte of the {@code length}-byte encoding of {@code cp}. */
        private static int leadByte(int cp, int length) {
            switch (length) {
                case 1:  return cp;
                case 2:  return 0x00C0 | ((cp >> 6) & 0x001F);
                case 3:  return 0x00E0 | ((cp >> 12) & 0x000F);
                default: return 0x00F0 | ((cp >> 18) & 0x0007);
            }
        } // leadByte(int,int):int

    } // class UTF8Producer

} // class UTF8