1 74 75 package org.eclipse.emf.ecore.xml.type.internal; 76 77 78 import java.text.CharacterIterator ; 79 import java.util.Hashtable ; 80 import java.util.Locale ; 81 import java.util.ResourceBundle ; 82 import java.util.Vector ; 83 84 import org.eclipse.emf.ecore.plugin.EcorePlugin; 85 86 89 public final class RegEx 90 { 91 static class BMPattern 92 { 93 char[] pattern; 94 95 int[] shiftTable; 96 97 boolean ignoreCase; 98 99 public BMPattern(String pat, boolean ignoreCase) 100 { 101 this(pat, 256, ignoreCase); 102 } 103 104 public BMPattern(String pat, int tableSize, boolean ignoreCase) 105 { 106 this.pattern = pat.toCharArray(); 107 this.shiftTable = new int [tableSize]; 108 this.ignoreCase = ignoreCase; 109 int length = pattern.length; 110 for (int i = 0; i < this.shiftTable.length; i++) 111 this.shiftTable[i] = length; 112 for (int i = 0; i < length; i++) 113 { 114 char ch = this.pattern[i]; 115 int diff = length - i - 1; 116 int index = ch % this.shiftTable.length; 117 if (diff < this.shiftTable[index]) 118 this.shiftTable[index] = diff; 119 if (this.ignoreCase) 120 { 121 ch = Character.toUpperCase(ch); 122 index = ch % this.shiftTable.length; 123 if (diff < this.shiftTable[index]) 124 this.shiftTable[index] = diff; 125 ch = Character.toLowerCase(ch); 126 index = ch % this.shiftTable.length; 127 if (diff < this.shiftTable[index]) 128 this.shiftTable[index] = diff; 129 } 130 } 131 } 132 133 137 public int matches(CharacterIterator iterator, int start, int limit) 138 { 139 if (this.ignoreCase) 140 return this.matchesIgnoreCase(iterator, start, limit); 141 int plength = this.pattern.length; 142 if (plength == 0) 143 return start; 144 int index = start + plength; 145 while (index <= limit) 146 { 147 int pindex = plength; 148 int nindex = index + 1; 149 char ch; 150 do 151 { 152 if ((ch = iterator.setIndex(--index)) != this.pattern[--pindex]) 153 break; 154 if (pindex == 0) 155 return index; 156 } 157 while (pindex > 0); 158 index += this.shiftTable[ch % 
this.shiftTable.length] + 1; 159 if (index < nindex) 160 index = nindex; 161 } 162 return -1; 163 } 164 165 169 public int matches(String str, int start, int limit) 170 { 171 if (this.ignoreCase) 172 return this.matchesIgnoreCase(str, start, limit); 173 int plength = this.pattern.length; 174 if (plength == 0) 175 return start; 176 int index = start + plength; 177 while (index <= limit) 178 { 179 int pindex = plength; 181 int nindex = index + 1; 182 char ch; 183 do 184 { 185 if ((ch = str.charAt(--index)) != this.pattern[--pindex]) 186 break; 187 if (pindex == 0) 188 return index; 189 } 190 while (pindex > 0); 191 index += this.shiftTable[ch % this.shiftTable.length] + 1; 192 if (index < nindex) 193 index = nindex; 194 } 195 return -1; 196 } 197 198 202 public int matches(char[] chars, int start, int limit) 203 { 204 if (this.ignoreCase) 205 return this.matchesIgnoreCase(chars, start, limit); 206 int plength = this.pattern.length; 207 if (plength == 0) 208 return start; 209 int index = start + plength; 210 while (index <= limit) 211 { 212 int pindex = plength; 214 int nindex = index + 1; 215 char ch; 216 do 217 { 218 if ((ch = chars[--index]) != this.pattern[--pindex]) 219 break; 220 if (pindex == 0) 221 return index; 222 } 223 while (pindex > 0); 224 index += this.shiftTable[ch % this.shiftTable.length] + 1; 225 if (index < nindex) 226 index = nindex; 227 } 228 return -1; 229 } 230 231 int matchesIgnoreCase(CharacterIterator iterator, int start, int limit) 232 { 233 int plength = this.pattern.length; 234 if (plength == 0) 235 return start; 236 int index = start + plength; 237 while (index <= limit) 238 { 239 int pindex = plength; 240 int nindex = index + 1; 241 char ch; 242 do 243 { 244 char ch1 = ch = iterator.setIndex(--index); 245 char ch2 = this.pattern[--pindex]; 246 if (ch1 != ch2) 247 { 248 ch1 = Character.toUpperCase(ch1); 249 ch2 = Character.toUpperCase(ch2); 250 if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) 251 break; 252 } 
253 if (pindex == 0) 254 return index; 255 } 256 while (pindex > 0); 257 index += this.shiftTable[ch % this.shiftTable.length] + 1; 258 if (index < nindex) 259 index = nindex; 260 } 261 return -1; 262 } 263 264 int matchesIgnoreCase(String text, int start, int limit) 265 { 266 int plength = this.pattern.length; 267 if (plength == 0) 268 return start; 269 int index = start + plength; 270 while (index <= limit) 271 { 272 int pindex = plength; 273 int nindex = index + 1; 274 char ch; 275 do 276 { 277 char ch1 = ch = text.charAt(--index); 278 char ch2 = this.pattern[--pindex]; 279 if (ch1 != ch2) 280 { 281 ch1 = Character.toUpperCase(ch1); 282 ch2 = Character.toUpperCase(ch2); 283 if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) 284 break; 285 } 286 if (pindex == 0) 287 return index; 288 } 289 while (pindex > 0); 290 index += this.shiftTable[ch % this.shiftTable.length] + 1; 291 if (index < nindex) 292 index = nindex; 293 } 294 return -1; 295 } 296 297 int matchesIgnoreCase(char[] chars, int start, int limit) 298 { 299 int plength = this.pattern.length; 300 if (plength == 0) 301 return start; 302 int index = start + plength; 303 while (index <= limit) 304 { 305 int pindex = plength; 306 int nindex = index + 1; 307 char ch; 308 do 309 { 310 char ch1 = ch = chars[--index]; 311 char ch2 = this.pattern[--pindex]; 312 if (ch1 != ch2) 313 { 314 ch1 = Character.toUpperCase(ch1); 315 ch2 = Character.toUpperCase(ch2); 316 if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) 317 break; 318 } 319 if (pindex == 0) 320 return index; 321 } 322 while (pindex > 0); 323 index += this.shiftTable[ch % this.shiftTable.length] + 1; 324 if (index < nindex) 325 index = nindex; 326 } 327 return -1; 328 } 329 345 } 346 347 public static class Match implements Cloneable { 348 int[] beginpos = null; 349 int[] endpos = null; 350 int nofgroups = 0; 351 352 CharacterIterator ciSource = null; 353 String strSource = null; 354 char[] charSource = null; 
355 356 359 public Match() { 360 } 361 362 365 public synchronized Object clone() { 366 Match ma = new Match(); 367 if (this.nofgroups > 0) { 368 ma.setNumberOfGroups(this.nofgroups); 369 if (this.ciSource != null) ma.setSource(this.ciSource); 370 if (this.strSource != null) ma.setSource(this.strSource); 371 for (int i = 0; i < this.nofgroups; i ++) { 372 ma.setBeginning(i, this.getBeginning(i)); 373 ma.setEnd(i, this.getEnd(i)); 374 } 375 } 376 return ma; 377 } 378 379 382 protected void setNumberOfGroups(int n) { 383 int oldn = this.nofgroups; 384 this.nofgroups = n; 385 if (oldn <= 0 386 || oldn < n || n*2 < oldn) { 387 this.beginpos = new int[n]; 388 this.endpos = new int[n]; 389 } 390 for (int i = 0; i < n; i ++) { 391 this.beginpos[i] = -1; 392 this.endpos[i] = -1; 393 } 394 } 395 396 399 protected void setSource(CharacterIterator ci) { 400 this.ciSource = ci; 401 this.strSource = null; 402 this.charSource = null; 403 } 404 407 protected void setSource(String str) { 408 this.ciSource = null; 409 this.strSource = str; 410 this.charSource = null; 411 } 412 415 protected void setSource(char[] chars) { 416 this.ciSource = null; 417 this.strSource = null; 418 this.charSource = chars; 419 } 420 421 424 protected void setBeginning(int index, int v) { 425 this.beginpos[index] = v; 426 } 427 428 431 protected void setEnd(int index, int v) { 432 this.endpos[index] = v; 433 } 434 435 439 public int getNumberOfGroups() { 440 if (this.nofgroups <= 0) 441 throw new IllegalStateException ("A result is not set."); 442 return this.nofgroups; 443 } 444 445 450 public int getBeginning(int index) { 451 if (this.beginpos == null) 452 throw new IllegalStateException ("A result is not set."); 453 if (index < 0 || this.nofgroups <= index) 454 throw new IllegalArgumentException ("The parameter must be less than " 455 +this.nofgroups+": "+index); 456 return this.beginpos[index]; 457 } 458 459 464 public int getEnd(int index) { 465 if (this.endpos == null) 466 throw new 
IllegalStateException ("A result is not set."); 467 if (index < 0 || this.nofgroups <= index) 468 throw new IllegalArgumentException ("The parameter must be less than " 469 +this.nofgroups+": "+index); 470 return this.endpos[index]; 471 } 472 473 478 public String getCapturedText(int index) { 479 if (this.beginpos == null) 480 throw new IllegalStateException ("match() has never been called."); 481 if (index < 0 || this.nofgroups <= index) 482 throw new IllegalArgumentException ("The parameter must be less than " 483 +this.nofgroups+": "+index); 484 String ret; 485 int begin = this.beginpos[index], end = this.endpos[index]; 486 if (begin < 0 || end < 0) return null; 487 if (this.ciSource != null) { 488 ret = REUtil.substring(this.ciSource, begin, end); 489 } else if (this.strSource != null) { 490 ret = this.strSource.substring(begin, end); 491 } else { 492 ret = new String (this.charSource, begin, end-begin); 493 } 494 return ret; 495 } 496 } 497 498 public final static class REUtil { 499 private REUtil() { 500 } 501 502 static final int composeFromSurrogates(int high, int low) { 503 return 0x10000 + ((high-0xd800)<<10) + low-0xdc00; 504 } 505 506 static final boolean isLowSurrogate(int ch) { 507 return (ch & 0xfc00) == 0xdc00; 508 } 509 510 static final boolean isHighSurrogate(int ch) { 511 return (ch & 0xfc00) == 0xd800; 512 } 513 514 static final String decomposeToSurrogates(int ch) { 515 char[] chs = new char[2]; 516 ch -= 0x10000; 517 chs[0] = (char)((ch>>10)+0xd800); 518 chs[1] = (char)((ch&0x3ff)+0xdc00); 519 return new String (chs); 520 } 521 522 static final String substring(CharacterIterator iterator, int begin, int end) { 523 char[] src = new char[end-begin]; 524 for (int i = 0; i < src.length; i ++) 525 src[i] = iterator.setIndex(i+begin); 526 return new String (src); 527 } 528 529 531 static final int getOptionValue(int ch) { 532 int ret = 0; 533 switch (ch) { 534 case 'i': 535 ret = RegularExpression.IGNORE_CASE; 536 break; 537 case 'm': 538 ret = 
RegularExpression.MULTIPLE_LINES; 539 break; 540 case 's': 541 ret = RegularExpression.SINGLE_LINE; 542 break; 543 case 'x': 544 ret = RegularExpression.EXTENDED_COMMENT; 545 break; 546 case 'u': 547 ret = RegularExpression.USE_UNICODE_CATEGORY; 548 break; 549 case 'w': 550 ret = RegularExpression.UNICODE_WORD_BOUNDARY; 551 break; 552 case 'F': 553 ret = RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION; 554 break; 555 case 'H': 556 ret = RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION; 557 break; 558 case 'X': 559 ret = RegularExpression.XMLSCHEMA_MODE; 560 break; 561 case ',': 562 ret = RegularExpression.SPECIAL_COMMA; 563 break; 564 default: 565 } 566 return ret; 567 } 568 569 static final int parseOptions(String opts) throws ParseException { 570 if (opts == null) return 0; 571 int options = 0; 572 for (int i = 0; i < opts.length(); i ++) { 573 int v = getOptionValue(opts.charAt(i)); 574 if (v == 0) 575 throw new ParseException("Unknown Option: "+opts.substring(i), -1); 576 options |= v; 577 } 578 return options; 579 } 580 581 static final String createOptionString(int options) { 582 StringBuffer sb = new StringBuffer (9); 583 if ((options & RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION) != 0) 584 sb.append('F'); 585 if ((options & RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) != 0) 586 sb.append('H'); 587 if ((options & RegularExpression.XMLSCHEMA_MODE) != 0) 588 sb.append('X'); 589 if ((options & RegularExpression.IGNORE_CASE) != 0) 590 sb.append('i'); 591 if ((options & RegularExpression.MULTIPLE_LINES) != 0) 592 sb.append('m'); 593 if ((options & RegularExpression.SINGLE_LINE) != 0) 594 sb.append('s'); 595 if ((options & RegularExpression.USE_UNICODE_CATEGORY) != 0) 596 sb.append('u'); 597 if ((options & RegularExpression.UNICODE_WORD_BOUNDARY) != 0) 598 sb.append('w'); 599 if ((options & RegularExpression.EXTENDED_COMMENT) != 0) 600 sb.append('x'); 601 if ((options & RegularExpression.SPECIAL_COMMA) != 0) 602 sb.append(','); 603 
return sb.toString().intern(); 604 } 605 606 608 static String stripExtendedComment(String regex) { 609 int len = regex.length(); 610 StringBuffer buffer = new StringBuffer (len); 611 int offset = 0; 612 while (offset < len) { 613 int ch = regex.charAt(offset++); 614 if (ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r' || ch == ' ') 616 continue; 617 618 if (ch == '#') { while (offset < len) { 620 ch = regex.charAt(offset++); 621 if (ch == '\r' || ch == '\n') 622 break; 623 } 624 continue; 625 } 626 627 int next; if (ch == '\\' && offset < len) { 629 if ((next = regex.charAt(offset)) == '#' 630 || next == '\t' || next == '\n' || next == '\f' 631 || next == '\r' || next == ' ') { 632 buffer.append((char)next); 633 offset ++; 634 } else { buffer.append('\\'); 636 buffer.append((char)next); 637 offset ++; 638 } 639 } else buffer.append((char)ch); 641 } 642 return buffer.toString(); 643 } 644 645 647 651 public static void main(String [] argv) { 652 String pattern = null; 653 try { 654 String options = ""; 655 String target = null; 656 if( argv.length == 0 ) { 657 System.out.println( "Error:Usage: java REUtil -i|-m|-s|-u|-w|-X regularExpression String" ); 658 System.exit( 0 ); 659 } 660 for (int i = 0; i < argv.length; i ++) { 661 if (argv[i].length() == 0 || argv[i].charAt(0) != '-') { 662 if (pattern == null) 663 pattern = argv[i]; 664 else if (target == null) 665 target = argv[i]; 666 else 667 System.err.println("Unnecessary: "+argv[i]); 668 } else if (argv[i].equals("-i")) { 669 options += "i"; 670 } else if (argv[i].equals("-m")) { 671 options += "m"; 672 } else if (argv[i].equals("-s")) { 673 options += "s"; 674 } else if (argv[i].equals("-u")) { 675 options += "u"; 676 } else if (argv[i].equals("-w")) { 677 options += "w"; 678 } else if (argv[i].equals("-X")) { 679 options += "X"; 680 } else { 681 System.err.println("Unknown option: "+argv[i]); 682 } 683 } 684 RegularExpression reg = new RegularExpression(pattern, options); 685 
System.out.println("RegularExpression: "+reg); 686 Match match = new Match(); 687 reg.matches(target, match); 688 for (int i = 0; i < match.getNumberOfGroups(); i ++) { 689 if (i == 0 ) System.out.print("Matched range for the whole pattern: "); 690 else System.out.print("["+i+"]: "); 691 if (match.getBeginning(i) < 0) 692 System.out.println("-1"); 693 else { 694 System.out.print(match.getBeginning(i)+", "+match.getEnd(i)+", "); 695 System.out.println("\""+match.getCapturedText(i)+"\""); 696 } 697 } 698 } catch (ParseException pe) { 699 if (pattern == null) { 700 pe.printStackTrace(); 701 } else { 702 System.err.println("org.apache.xerces.utils.regex.ParseException: "+pe.getMessage()); 703 String indent = " "; 704 System.err.println(indent+pattern); 705 int loc = pe.getLocation(); 706 if (loc >= 0) { 707 System.err.print(indent); 708 for (int i = 0; i < loc; i ++) System.err.print("-"); 709 System.err.println("^"); 710 } 711 } 712 } catch (Exception e) { 713 e.printStackTrace(); 714 } 715 } 716 717 static final int CACHESIZE = 20; 718 static final RegularExpression[] regexCache = new RegularExpression[CACHESIZE]; 719 725 public static RegularExpression createRegex(String pattern, String options) 726 throws ParseException { 727 RegularExpression re = null; 728 int intOptions = REUtil.parseOptions(options); 729 synchronized (REUtil.regexCache) { 730 int i; 731 for (i = 0; i < REUtil.CACHESIZE; i ++) { 732 RegularExpression cached = REUtil.regexCache[i]; 733 if (cached == null) { 734 i = -1; 735 break; 736 } 737 if (cached.equals(pattern, intOptions)) { 738 re = cached; 739 break; 740 } 741 } 742 if (re != null) { 743 if (i != 0) { 744 System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, i); 745 REUtil.regexCache[0] = re; 746 } 747 } else { 748 re = new RegularExpression(pattern, options); 749 System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, REUtil.CACHESIZE-1); 750 REUtil.regexCache[0] = re; 751 } 752 } 753 return re; 754 } 755 756 760 public 
static boolean matches(String regex, String target) throws ParseException { 761 return REUtil.createRegex(regex, null).matches(target); 762 } 763 764 768 public static boolean matches(String regex, String options, String target) throws ParseException { 769 return REUtil.createRegex(regex, options).matches(target); 770 } 771 772 774 777 public static String quoteMeta(String literal) { 778 int len = literal.length(); 779 StringBuffer buffer = null; 780 for (int i = 0; i < len; i ++) { 781 int ch = literal.charAt(i); 782 if (".*+?{[()|\\^$".indexOf(ch) >= 0) { 783 if (buffer == null) { 784 buffer = new StringBuffer (i+(len-i)*2); 785 if (i > 0) buffer.append(literal.substring(0, i)); 786 } 787 buffer.append('\\'); 788 buffer.append((char)ch); 789 } else if (buffer != null) 790 buffer.append((char)ch); 791 } 792 return buffer != null ? buffer.toString() : literal; 793 } 794 795 797 static void dumpString(String v) { 798 for (int i = 0; i < v.length(); i ++) { 799 System.out.print(Integer.toHexString(v.charAt(i))); 800 System.out.print(" "); 801 } 802 System.out.println(); 803 } 804 } 805 806 807 1267 public static class RegularExpression implements java.io.Serializable { 1268 static final boolean DEBUG = false; 1269 1270 1273 private synchronized void compile(Token tok) { 1274 if (this.operations != null) 1275 return; 1276 this.numberOfClosures = 0; 1277 this.operations = this.compile(tok, null, false); 1278 } 1279 1280 1283 private Op compile(Token tok, Op next, boolean reverse) { 1284 Op ret; 1285 switch (tok.type) { 1286 case Token.DOT: 1287 ret = Op.createDot(); 1288 ret.next = next; 1289 break; 1290 1291 case Token.CHAR: 1292 ret = Op.createChar(tok.getChar()); 1293 ret.next = next; 1294 break; 1295 1296 case Token.ANCHOR: 1297 ret = Op.createAnchor(tok.getChar()); 1298 ret.next = next; 1299 break; 1300 1301 case Token.RANGE: 1302 case Token.NRANGE: 1303 ret = Op.createRange(tok); 1304 ret.next = next; 1305 break; 1306 1307 case Token.CONCAT: 1308 ret = next; 1309 
if (!reverse) { 1310 for (int i = tok.size()-1; i >= 0; i --) { 1311 ret = compile(tok.getChild(i), ret, false); 1312 } 1313 } else { 1314 for (int i = 0; i < tok.size(); i ++) { 1315 ret = compile(tok.getChild(i), ret, true); 1316 } 1317 } 1318 break; 1319 1320 case Token.UNION: 1321 Op.UnionOp uni = Op.createUnion(tok.size()); 1322 for (int i = 0; i < tok.size(); i ++) { 1323 uni.addElement(compile(tok.getChild(i), next, reverse)); 1324 } 1325 ret = uni; break; 1327 1328 case Token.CLOSURE: 1329 case Token.NONGREEDYCLOSURE: 1330 Token child = tok.getChild(0); 1331 int min = tok.getMin(); 1332 int max = tok.getMax(); 1333 if (min >= 0 && min == max) { ret = next; 1335 for (int i = 0; i < min; i ++) { 1336 ret = compile(child, ret, reverse); 1337 } 1338 break; 1339 } 1340 if (min > 0 && max > 0) 1341 max -= min; 1342 if (max > 0) { 1343 ret = next; 1345 for (int i = 0; i < max; i ++) { 1346 Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE); 1347 q.next = next; 1348 q.setChild(compile(child, ret, reverse)); 1349 ret = q; 1350 } 1351 } else { 1352 Op.ChildOp op; 1353 if (tok.type == Token.NONGREEDYCLOSURE) { 1354 op = Op.createNonGreedyClosure(); 1355 } else { if (child.getMinLength() == 0) 1357 op = Op.createClosure(this.numberOfClosures++); 1358 else 1359 op = Op.createClosure(-1); 1360 } 1361 op.next = next; 1362 op.setChild(compile(child, op, reverse)); 1363 ret = op; 1364 } 1365 if (min > 0) { 1366 for (int i = 0; i < min; i ++) { 1367 ret = compile(child, ret, reverse); 1368 } 1369 } 1370 break; 1371 1372 case Token.EMPTY: 1373 ret = next; 1374 break; 1375 1376 case Token.STRING: 1377 ret = Op.createString(tok.getString()); 1378 ret.next = next; 1379 break; 1380 1381 case Token.BACKREFERENCE: 1382 ret = Op.createBackReference(tok.getReferenceNumber()); 1383 ret.next = next; 1384 break; 1385 1386 case Token.PAREN: 1387 if (tok.getParenNumber() == 0) { 1388 ret = compile(tok.getChild(0), next, reverse); 1389 } else if (reverse) { 1390 next = 
Op.createCapture(tok.getParenNumber(), next); 1391 next = compile(tok.getChild(0), next, reverse); 1392 ret = Op.createCapture(-tok.getParenNumber(), next); 1393 } else { 1394 next = Op.createCapture(-tok.getParenNumber(), next); 1395 next = compile(tok.getChild(0), next, reverse); 1396 ret = Op.createCapture(tok.getParenNumber(), next); 1397 } 1398 break; 1399 1400 case Token.LOOKAHEAD: 1401 ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false)); 1402 break; 1403 case Token.NEGATIVELOOKAHEAD: 1404 ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false)); 1405 break; 1406 case Token.LOOKBEHIND: 1407 ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true)); 1408 break; 1409 case Token.NEGATIVELOOKBEHIND: 1410 ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true)); 1411 break; 1412 1413 case Token.INDEPENDENT: 1414 ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse)); 1415 break; 1416 1417 case Token.MODIFIERGROUP: 1418 ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse), 1419 ((Token.ModifierToken)tok).getOptions(), 1420 ((Token.ModifierToken)tok).getOptionsMask()); 1421 break; 1422 1423 case Token.CONDITION: 1424 Token.ConditionToken ctok = (Token.ConditionToken)tok; 1425 int ref = ctok.refNumber; 1426 Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse); 1427 Op yes = compile(ctok.yes, next, reverse); 1428 Op no = ctok.no == null ? 
null : compile(ctok.no, next, reverse); 1429 ret = Op.createCondition(next, ref, condition, yes, no); 1430 break; 1431 1432 default: 1433 throw new RuntimeException ("Unknown token type: "+tok.type); 1434 } return ret; 1436 } 1437 1438 1439 1441 1446 public boolean matches(char[] target) { 1447 return this.matches(target, 0, target .length , (Match)null); 1448 } 1449 1450 1458 public boolean matches(char[] target, int start, int end) { 1459 return this.matches(target, start, end, (Match)null); 1460 } 1461 1462 1468 public boolean matches(char[] target, Match match) { 1469 return this.matches(target, 0, target .length , match); 1470 } 1471 1472 1473 1482 public boolean matches(char[] target, int start, int end, Match match) { 1483 1484 synchronized (this) { 1485 if (this.operations == null) 1486 this.prepare(); 1487 if (this.context == null) 1488 this.context = new Context(); 1489 } 1490 Context con = null; 1491 synchronized (this.context) { 1492 con = this.context.inuse ? new Context() : this.context; 1493 con.reset(target, start, end, this.numberOfClosures); 1494 } 1495 if (match != null) { 1496 match.setNumberOfGroups(this.nofparen); 1497 match.setSource(target); 1498 } else if (this.hasBackReferences) { 1499 match = new Match(); 1500 match.setNumberOfGroups(this.nofparen); 1501 } 1504 con.match = match; 1505 1506 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 1507 int matchEnd = this. 
matchCharArray (con, this.operations, con.start, 1, this.options); 1508 if (matchEnd == con.limit) { 1510 if (con.match != null) { 1511 con.match.setBeginning(0, con.start); 1512 con.match.setEnd(0, matchEnd); 1513 } 1514 con.inuse = false; 1515 return true; 1516 } 1517 return false; 1518 } 1519 1520 1524 if (this.fixedStringOnly) { 1525 int o = this.fixedStringTable.matches(target, con.start, con.limit); 1527 if (o >= 0) { 1528 if (con.match != null) { 1529 con.match.setBeginning(0, o); 1530 con.match.setEnd(0, o+this.fixedString.length()); 1531 } 1532 con.inuse = false; 1533 return true; 1534 } 1535 con.inuse = false; 1536 return false; 1537 } 1538 1539 1544 if (this.fixedString != null) { 1545 int o = this.fixedStringTable.matches(target, con.start, con.limit); 1546 if (o < 0) { 1547 con.inuse = false; 1549 return false; 1550 } 1551 } 1552 1553 int limit = con.limit-this.minlength; 1554 int matchStart; 1555 int matchEnd = -1; 1556 1557 1560 if (this.operations != null 1561 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 1562 if (isSet(this.options, SINGLE_LINE)) { 1563 matchStart = con.start; 1564 matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options); 1565 } else { 1566 boolean previousIsEOL = true; 1567 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1568 int ch = target [ matchStart ] ; 1569 if (isEOLChar(ch)) { 1570 previousIsEOL = true; 1571 } else { 1572 if (previousIsEOL) { 1573 if (0 <= (matchEnd = this. 
matchCharArray (con, this.operations, 1574 matchStart, 1, this.options))) 1575 break; 1576 } 1577 previousIsEOL = false; 1578 } 1579 } 1580 } 1581 } 1582 1583 1586 else if (this.firstChar != null) { 1587 RangeToken range = this.firstChar; 1589 if (RegularExpression.isSet(this.options, IGNORE_CASE)) { 1590 range = this.firstChar.getCaseInsensitiveToken(); 1591 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1592 int ch = target [ matchStart ] ; 1593 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 1594 ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] ); 1595 if (!range.match(ch)) continue; 1596 } else { 1597 if (!range.match(ch)) { 1598 char ch1 = Character.toUpperCase((char)ch); 1599 if (!range.match(ch1)) 1600 if (!range.match(Character.toLowerCase(ch1))) 1601 continue; 1602 } 1603 } 1604 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, 1605 matchStart, 1, this.options))) 1606 break; 1607 } 1608 } else { 1609 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1610 int ch = target [ matchStart ] ; 1611 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) 1612 ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] ); 1613 if (!range.match(ch)) continue; 1614 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, 1615 matchStart, 1, this.options))) 1616 break; 1617 } 1618 } 1619 } 1620 1621 1624 else { 1625 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1626 if (0 <= (matchEnd = this. 
matchCharArray (con, this.operations, matchStart, 1, this.options))) 1627 break; 1628 } 1629 } 1630 1631 if (matchEnd >= 0) { 1632 if (con.match != null) { 1633 con.match.setBeginning(0, matchStart); 1634 con.match.setEnd(0, matchEnd); 1635 } 1636 con.inuse = false; 1637 return true; 1638 } else { 1639 con.inuse = false; 1640 return false; 1641 } 1642 } 1643 1644 1647 private int matchCharArray (Context con, Op op, int offset, int dx, int opts) { 1648 1649 char[] target = con.charTarget; 1650 1651 1652 while (true) { 1653 if (op == null) 1654 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; 1655 if (offset > con.limit || offset < con.start) 1656 return -1; 1657 switch (op.type) { 1658 case Op.CHAR: 1659 if (isSet(opts, IGNORE_CASE)) { 1660 int ch = op.getData(); 1661 if (dx > 0) { 1662 if (offset >= con.limit || !matchIgnoreCase(ch, target [ offset ] )) 1663 return -1; 1664 offset ++; 1665 } else { 1666 int o1 = offset-1; 1667 if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target [ o1 ] )) 1668 return -1; 1669 offset = o1; 1670 } 1671 } else { 1672 int ch = op.getData(); 1673 if (dx > 0) { 1674 if (offset >= con.limit || ch != target [ offset ] ) 1675 return -1; 1676 offset ++; 1677 } else { 1678 int o1 = offset-1; 1679 if (o1 >= con.limit || o1 < 0 || ch != target [ o1 ] ) 1680 return -1; 1681 offset = o1; 1682 } 1683 } 1684 op = op.next; 1685 break; 1686 1687 case Op.DOT: 1688 if (dx > 0) { 1689 if (offset >= con.limit) 1690 return -1; 1691 int ch = target [ offset ] ; 1692 if (isSet(opts, SINGLE_LINE)) { 1693 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 1694 offset ++; 1695 } else { 1696 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 1697 ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] ); 1698 if (isEOLChar(ch)) 1699 return -1; 1700 } 1701 offset ++; 1702 } else { 1703 int o1 = offset-1; 1704 if (o1 >= con.limit || o1 < 0) 1705 return -1; 1706 int ch = target [ o1 ] ; 1707 if (isSet(opts, SINGLE_LINE)) 
{ 1708 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 1709 o1 --; 1710 } else { 1711 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 1712 ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch); 1713 if (!isEOLChar(ch)) 1714 return -1; 1715 } 1716 offset = o1; 1717 } 1718 op = op.next; 1719 break; 1720 1721 case Op.RANGE: 1722 case Op.NRANGE: 1723 if (dx > 0) { 1724 if (offset >= con.limit) 1725 return -1; 1726 int ch = target [ offset ] ; 1727 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 1728 ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] ); 1729 RangeToken tok = op.getToken(); 1730 if (isSet(opts, IGNORE_CASE)) { 1731 tok = tok.getCaseInsensitiveToken(); 1732 if (!tok.match(ch)) { 1733 if (ch >= 0x10000) return -1; 1734 char uch; 1735 if (!tok.match(uch = Character.toUpperCase((char)ch)) 1736 && !tok.match(Character.toLowerCase(uch))) 1737 return -1; 1738 } 1739 } else { 1740 if (!tok.match(ch)) return -1; 1741 } 1742 offset ++; 1743 } else { 1744 int o1 = offset-1; 1745 if (o1 >= con.limit || o1 < 0) 1746 return -1; 1747 int ch = target [ o1 ] ; 1748 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 1749 ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch); 1750 RangeToken tok = op.getToken(); 1751 if (isSet(opts, IGNORE_CASE)) { 1752 tok = tok.getCaseInsensitiveToken(); 1753 if (!tok.match(ch)) { 1754 if (ch >= 0x10000) return -1; 1755 char uch; 1756 if (!tok.match(uch = Character.toUpperCase((char)ch)) 1757 && !tok.match(Character.toLowerCase(uch))) 1758 return -1; 1759 } 1760 } else { 1761 if (!tok.match(ch)) return -1; 1762 } 1763 offset = o1; 1764 } 1765 op = op.next; 1766 break; 1767 1768 case Op.ANCHOR: 1769 boolean go = false; 1770 switch (op.getData()) { 1771 case '^': 1772 if (isSet(opts, MULTIPLE_LINES)) { 1773 if (!(offset == con.start 1774 || offset > con.start && isEOLChar( target [ offset-1 ] ))) 1775 return -1; 1776 } else { 1777 if (offset != con.start) 1778 return -1; 1779 } 1780 break; 1781 1782 case '@': if (!(offset == 
con.start 1785 || offset > con.start && isEOLChar( target [ offset-1 ] ))) 1786 return -1; 1787 break; 1788 1789 case '$': 1790 if (isSet(opts, MULTIPLE_LINES)) { 1791 if (!(offset == con.limit 1792 || offset < con.limit && isEOLChar( target [ offset ] ))) 1793 return -1; 1794 } else { 1795 if (!(offset == con.limit 1796 || offset+1 == con.limit && isEOLChar( target [ offset ] ) 1797 || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN 1798 && target [ offset+1 ] == LINE_FEED)) 1799 return -1; 1800 } 1801 break; 1802 1803 case 'A': 1804 if (offset != con.start) return -1; 1805 break; 1806 1807 case 'Z': 1808 if (!(offset == con.limit 1809 || offset+1 == con.limit && isEOLChar( target [ offset ] ) 1810 || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN 1811 && target [ offset+1 ] == LINE_FEED)) 1812 return -1; 1813 break; 1814 1815 case 'z': 1816 if (offset != con.limit) return -1; 1817 break; 1818 1819 case 'b': 1820 if (con.length == 0) return -1; 1821 { 1822 int after = getWordType(target, con.start, con.limit, offset, opts); 1823 if (after == WT_IGNORE) return -1; 1824 int before = getPreviousWordType(target, con.start, con.limit, offset, opts); 1825 if (after == before) return -1; 1826 } 1827 break; 1828 1829 case 'B': 1830 if (con.length == 0) 1831 go = true; 1832 else { 1833 int after = getWordType(target, con.start, con.limit, offset, opts); 1834 go = after == WT_IGNORE 1835 || after == getPreviousWordType(target, con.start, con.limit, offset, opts); 1836 } 1837 if (!go) return -1; 1838 break; 1839 1840 case '<': 1841 if (con.length == 0 || offset == con.limit) return -1; 1842 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER 1843 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) 1844 return -1; 1845 break; 1846 1847 case '>': 1848 if (con.length == 0 || offset == con.start) return -1; 1849 if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER 1850 || 
getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) 1851 return -1; 1852 break; 1853 } op = op.next; 1855 break; 1856 1857 case Op.BACKREFERENCE: 1858 { 1859 int refno = op.getData(); 1860 if (refno <= 0 || refno >= this.nofparen) 1861 throw new RuntimeException ("Internal Error: Reference number must be more than zero: "+refno); 1862 if (con.match.getBeginning(refno) < 0 1863 || con.match.getEnd(refno) < 0) 1864 return -1; int o2 = con.match.getBeginning(refno); 1866 int literallen = con.match.getEnd(refno)-o2; 1867 if (!isSet(opts, IGNORE_CASE)) { 1868 if (dx > 0) { 1869 if (!regionMatches(target, offset, con.limit, o2, literallen)) 1870 return -1; 1871 offset += literallen; 1872 } else { 1873 if (!regionMatches(target, offset-literallen, con.limit, o2, literallen)) 1874 return -1; 1875 offset -= literallen; 1876 } 1877 } else { 1878 if (dx > 0) { 1879 if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen)) 1880 return -1; 1881 offset += literallen; 1882 } else { 1883 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 1884 o2, literallen)) 1885 return -1; 1886 offset -= literallen; 1887 } 1888 } 1889 } 1890 op = op.next; 1891 break; 1892 case Op.STRING: 1893 { 1894 String literal = op.getString(); 1895 int literallen = literal.length(); 1896 if (!isSet(opts, IGNORE_CASE)) { 1897 if (dx > 0) { 1898 if (!regionMatches(target, offset, con.limit, literal, literallen)) 1899 return -1; 1900 offset += literallen; 1901 } else { 1902 if (!regionMatches(target, offset-literallen, con.limit, literal, literallen)) 1903 return -1; 1904 offset -= literallen; 1905 } 1906 } else { 1907 if (dx > 0) { 1908 if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen)) 1909 return -1; 1910 offset += literallen; 1911 } else { 1912 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 1913 literal, literallen)) 1914 return -1; 1915 offset -= literallen; 1916 } 1917 } 1918 } 1919 op = op.next; 1920 
break; 1921 1922 case Op.CLOSURE: 1923 { 1924 1928 int id = op.getData(); 1929 if (id >= 0) { 1930 int previousOffset = con.offsets[id]; 1931 if (previousOffset < 0 || previousOffset != offset) { 1932 con.offsets[id] = offset; 1933 } else { 1934 con.offsets[id] = -1; 1935 op = op.next; 1936 break; 1937 } 1938 } 1939 1940 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); 1941 if (id >= 0) con.offsets[id] = -1; 1942 if (ret >= 0) return ret; 1943 op = op.next; 1944 } 1945 break; 1946 1947 case Op.QUESTION: 1948 { 1949 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); 1950 if (ret >= 0) return ret; 1951 op = op.next; 1952 } 1953 break; 1954 1955 case Op.NONGREEDYCLOSURE: 1956 case Op.NONGREEDYQUESTION: 1957 { 1958 int ret = this. matchCharArray (con, op.next, offset, dx, opts); 1959 if (ret >= 0) return ret; 1960 op = op.getChild(); 1961 } 1962 break; 1963 1964 case Op.UNION: 1965 for (int i = 0; i < op.size(); i ++) { 1966 int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts); 1967 if (DEBUG) { 1968 System.err.println("UNION: "+i+", ret="+ret); 1969 } 1970 if (ret >= 0) return ret; 1971 } 1972 return -1; 1973 1974 case Op.CAPTURE: 1975 int refno = op.getData(); 1976 if (con.match != null && refno > 0) { 1977 int save = con.match.getBeginning(refno); 1978 con.match.setBeginning(refno, offset); 1979 int ret = this. matchCharArray (con, op.next, offset, dx, opts); 1980 if (ret < 0) con.match.setBeginning(refno, save); 1981 return ret; 1982 } else if (con.match != null && refno < 0) { 1983 int index = -refno; 1984 int save = con.match.getEnd(index); 1985 con.match.setEnd(index, offset); 1986 int ret = this. matchCharArray (con, op.next, offset, dx, opts); 1987 if (ret < 0) con.match.setEnd(index, save); 1988 return ret; 1989 } 1990 op = op.next; 1991 break; 1992 1993 case Op.LOOKAHEAD: 1994 if (0 > this. 
matchCharArray (con, op.getChild(), offset, 1, opts)) return -1; 1995 op = op.next; 1996 break; 1997 case Op.NEGATIVELOOKAHEAD: 1998 if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1; 1999 op = op.next; 2000 break; 2001 case Op.LOOKBEHIND: 2002 if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1; 2003 op = op.next; 2004 break; 2005 case Op.NEGATIVELOOKBEHIND: 2006 if (0 <= this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1; 2007 op = op.next; 2008 break; 2009 2010 case Op.INDEPENDENT: 2011 { 2012 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); 2013 if (ret < 0) return ret; 2014 offset = ret; 2015 op = op.next; 2016 } 2017 break; 2018 2019 case Op.MODIFIER: 2020 { 2021 int localopts = opts; 2022 localopts |= op.getData(); 2023 localopts &= ~op.getData2(); 2024 int ret = this. matchCharArray (con, op.getChild(), offset, dx, localopts); 2026 if (ret < 0) return ret; 2027 offset = ret; 2028 op = op.next; 2029 } 2030 break; 2031 2032 case Op.CONDITION: 2033 { 2034 Op.ConditionOp cop = (Op.ConditionOp)op; 2035 boolean matchp = false; 2036 if (cop.refNumber > 0) { 2037 if (cop.refNumber >= this.nofparen) 2038 throw new RuntimeException ("Internal Error: Reference number must be more than zero: "+cop.refNumber); 2039 matchp = con.match.getBeginning(cop.refNumber) >= 0 2040 && con.match.getEnd(cop.refNumber) >= 0; 2041 } else { 2042 matchp = 0 <= this. 
matchCharArray (con, cop.condition, offset, dx, opts); 2043 } 2044 2045 if (matchp) { 2046 op = cop.yes; 2047 } else if (cop.no != null) { 2048 op = cop.no; 2049 } else { 2050 op = cop.next; 2051 } 2052 } 2053 break; 2054 2055 default: 2056 throw new RuntimeException ("Unknown operation type: "+op.type); 2057 } } } 2060 2061 private static final int getPreviousWordType(char[] target, int begin, int end, 2062 int offset, int opts) { 2063 int ret = getWordType(target, begin, end, --offset, opts); 2064 while (ret == WT_IGNORE) 2065 ret = getWordType(target, begin, end, --offset, opts); 2066 return ret; 2067 } 2068 2069 private static final int getWordType(char[] target, int begin, int end, 2070 int offset, int opts) { 2071 if (offset < begin || offset >= end) return WT_OTHER; 2072 return getWordType0( target [ offset ] , opts); 2073 } 2074 2075 2076 2077 private static final boolean regionMatches(char[] target, int offset, int limit, 2078 String part, int partlen) { 2079 if (offset < 0) return false; 2080 if (limit-offset < partlen) 2081 return false; 2082 int i = 0; 2083 while (partlen-- > 0) { 2084 if ( target [ offset++ ] != part.charAt(i++)) 2085 return false; 2086 } 2087 return true; 2088 } 2089 2090 private static final boolean regionMatches(char[] target, int offset, int limit, 2091 int offset2, int partlen) { 2092 if (offset < 0) return false; 2093 if (limit-offset < partlen) 2094 return false; 2095 int i = offset2; 2096 while (partlen-- > 0) { 2097 if ( target [ offset++ ] != target [ i++ ] ) 2098 return false; 2099 } 2100 return true; 2101 } 2102 2103 2106 private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit, 2107 String part, int partlen) { 2108 if (offset < 0) return false; 2109 if (limit-offset < partlen) 2110 return false; 2111 int i = 0; 2112 while (partlen-- > 0) { 2113 char ch1 = target [ offset++ ] ; 2114 char ch2 = part.charAt(i++); 2115 if (ch1 == ch2) 2116 continue; 2117 char uch1 = Character.toUpperCase(ch1); 
2118 char uch2 = Character.toUpperCase(ch2); 2119 if (uch1 == uch2) 2120 continue; 2121 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 2122 return false; 2123 } 2124 return true; 2125 } 2126 2127 private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit, 2128 int offset2, int partlen) { 2129 if (offset < 0) return false; 2130 if (limit-offset < partlen) 2131 return false; 2132 int i = offset2; 2133 while (partlen-- > 0) { 2134 char ch1 = target [ offset++ ] ; 2135 char ch2 = target [ i++ ] ; 2136 if (ch1 == ch2) 2137 continue; 2138 char uch1 = Character.toUpperCase(ch1); 2139 char uch2 = Character.toUpperCase(ch2); 2140 if (uch1 == uch2) 2141 continue; 2142 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 2143 return false; 2144 } 2145 return true; 2146 } 2147 2148 2149 2150 2151 2156 public boolean matches(String target) { 2157 return this.matches(target, 0, target .length() , (Match)null); 2158 } 2159 2160 2168 public boolean matches(String target, int start, int end) { 2169 return this.matches(target, start, end, (Match)null); 2170 } 2171 2172 2178 public boolean matches(String target, Match match) { 2179 return this.matches(target, 0, target .length() , match); 2180 } 2181 2182 2191 public boolean matches(String target, int start, int end, Match match) { 2192 2193 synchronized (this) { 2194 if (this.operations == null) 2195 this.prepare(); 2196 if (this.context == null) 2197 this.context = new Context(); 2198 } 2199 Context con = null; 2200 synchronized (this.context) { 2201 con = this.context.inuse ? 
new Context() : this.context; 2202 con.reset(target, start, end, this.numberOfClosures); 2203 } 2204 if (match != null) { 2205 match.setNumberOfGroups(this.nofparen); 2206 match.setSource(target); 2207 } else if (this.hasBackReferences) { 2208 match = new Match(); 2209 match.setNumberOfGroups(this.nofparen); 2210 } 2213 con.match = match; 2214 2215 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 2216 if (DEBUG) { 2217 System.err.println("target string="+target); 2218 } 2219 int matchEnd = this. matchString (con, this.operations, con.start, 1, this.options); 2220 if (DEBUG) { 2221 System.err.println("matchEnd="+matchEnd); 2222 System.err.println("con.limit="+con.limit); 2223 } 2224 if (matchEnd == con.limit) { 2225 if (con.match != null) { 2226 con.match.setBeginning(0, con.start); 2227 con.match.setEnd(0, matchEnd); 2228 } 2229 con.inuse = false; 2230 return true; 2231 } 2232 return false; 2233 } 2234 2235 2239 if (this.fixedStringOnly) { 2240 int o = this.fixedStringTable.matches(target, con.start, con.limit); 2242 if (o >= 0) { 2243 if (con.match != null) { 2244 con.match.setBeginning(0, o); 2245 con.match.setEnd(0, o+this.fixedString.length()); 2246 } 2247 con.inuse = false; 2248 return true; 2249 } 2250 con.inuse = false; 2251 return false; 2252 } 2253 2254 2259 if (this.fixedString != null) { 2260 int o = this.fixedStringTable.matches(target, con.start, con.limit); 2261 if (o < 0) { 2262 con.inuse = false; 2264 return false; 2265 } 2266 } 2267 2268 int limit = con.limit-this.minlength; 2269 int matchStart; 2270 int matchEnd = -1; 2271 2272 2275 if (this.operations != null 2276 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 2277 if (isSet(this.options, SINGLE_LINE)) { 2278 matchStart = con.start; 2279 matchEnd = this. 
matchString (con, this.operations, con.start, 1, this.options); 2280 } else { 2281 boolean previousIsEOL = true; 2282 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2283 int ch = target .charAt( matchStart ) ; 2284 if (isEOLChar(ch)) { 2285 previousIsEOL = true; 2286 } else { 2287 if (previousIsEOL) { 2288 if (0 <= (matchEnd = this. matchString (con, this.operations, 2289 matchStart, 1, this.options))) 2290 break; 2291 } 2292 previousIsEOL = false; 2293 } 2294 } 2295 } 2296 } 2297 2298 2301 else if (this.firstChar != null) { 2302 RangeToken range = this.firstChar; 2304 if (RegularExpression.isSet(this.options, IGNORE_CASE)) { 2305 range = this.firstChar.getCaseInsensitiveToken(); 2306 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2307 int ch = target .charAt( matchStart ) ; 2308 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 2309 ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) ); 2310 if (!range.match(ch)) continue; 2311 } else { 2312 if (!range.match(ch)) { 2313 char ch1 = Character.toUpperCase((char)ch); 2314 if (!range.match(ch1)) 2315 if (!range.match(Character.toLowerCase(ch1))) 2316 continue; 2317 } 2318 } 2319 if (0 <= (matchEnd = this. matchString (con, this.operations, 2320 matchStart, 1, this.options))) 2321 break; 2322 } 2323 } else { 2324 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2325 int ch = target .charAt( matchStart ) ; 2326 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) 2327 ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) ); 2328 if (!range.match(ch)) continue; 2329 if (0 <= (matchEnd = this. matchString (con, this.operations, 2330 matchStart, 1, this.options))) 2331 break; 2332 } 2333 } 2334 } 2335 2336 2339 else { 2340 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2341 if (0 <= (matchEnd = this. 
matchString (con, this.operations, matchStart, 1, this.options))) 2342 break; 2343 } 2344 } 2345 2346 if (matchEnd >= 0) { 2347 if (con.match != null) { 2348 con.match.setBeginning(0, matchStart); 2349 con.match.setEnd(0, matchEnd); 2350 } 2351 con.inuse = false; 2352 return true; 2353 } else { 2354 con.inuse = false; 2355 return false; 2356 } 2357 } 2358 2359 2362 private int matchString (Context con, Op op, int offset, int dx, int opts) { 2363 2364 2365 2366 2367 String target = con.strTarget; 2368 2369 2370 2371 2372 while (true) { 2373 if (op == null) 2374 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; 2375 if (offset > con.limit || offset < con.start) 2376 return -1; 2377 switch (op.type) { 2378 case Op.CHAR: 2379 if (isSet(opts, IGNORE_CASE)) { 2380 int ch = op.getData(); 2381 if (dx > 0) { 2382 if (offset >= con.limit || !matchIgnoreCase(ch, target .charAt( offset ) )) 2383 return -1; 2384 offset ++; 2385 } else { 2386 int o1 = offset-1; 2387 if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .charAt( o1 ) )) 2388 return -1; 2389 offset = o1; 2390 } 2391 } else { 2392 int ch = op.getData(); 2393 if (dx > 0) { 2394 if (offset >= con.limit || ch != target .charAt( offset ) ) 2395 return -1; 2396 offset ++; 2397 } else { 2398 int o1 = offset-1; 2399 if (o1 >= con.limit || o1 < 0 || ch != target .charAt( o1 ) ) 2400 return -1; 2401 offset = o1; 2402 } 2403 } 2404 op = op.next; 2405 break; 2406 2407 case Op.DOT: 2408 if (dx > 0) { 2409 if (offset >= con.limit) 2410 return -1; 2411 int ch = target .charAt( offset ) ; 2412 if (isSet(opts, SINGLE_LINE)) { 2413 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 2414 offset ++; 2415 } else { 2416 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 2417 ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) ); 2418 if (isEOLChar(ch)) 2419 return -1; 2420 } 2421 offset ++; 2422 } else { 2423 int o1 = offset-1; 2424 if (o1 >= con.limit || o1 < 0) 2425 return -1; 2426 
int ch = target.charAt(o1);
                    if (isSet(opts, SINGLE_LINE)) {
                        if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
                            o1 --;
                    } else {
                        if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
                            ch = REUtil.composeFromSurrogates(target.charAt(--o1), ch);
                        // Without SINGLE_LINE, DOT rejects end-of-line characters. The forward
                        // branch of this case does so; this backward branch used to test
                        // !isEOLChar(ch), which accepted ONLY end-of-line characters.
                        if (isEOLChar(ch))
                            return -1;
                    }
                    offset = o1;
                }
                op = op.next;
                break;

            case Op.RANGE:
            case Op.NRANGE:
                if (dx > 0) {
                    if (offset >= con.limit)
                        return -1;
                    int ch = target.charAt(offset);
                    // Combine a surrogate pair into one code point before testing the range.
                    if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
                        ch = REUtil.composeFromSurrogates(ch, target.charAt(++offset));
                    RangeToken tok = op.getToken();
                    if (isSet(opts, IGNORE_CASE)) {
                        tok = tok.getCaseInsensitiveToken();
                        if (!tok.match(ch)) {
                            // The case-folding retry below casts to char, so it only
                            // applies to code points below 0x10000.
                            if (ch >= 0x10000)  return -1;
                            char uch;
                            if (!tok.match(uch = Character.toUpperCase((char)ch))
                                && !tok.match(Character.toLowerCase(uch)))
                                return -1;
                        }
                    } else {
                        if (!tok.match(ch))  return -1;
                    }
                    offset ++;
                } else {
                    int o1 = offset-1;
                    if (o1 >= con.limit || o1 < 0)
                        return -1;
                    int ch = target.charAt(o1);
                    if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
                        ch = REUtil.composeFromSurrogates(target.charAt(--o1), ch);
                    RangeToken tok = op.getToken();
                    if (isSet(opts, IGNORE_CASE)) {
                        tok = tok.getCaseInsensitiveToken();
                        if (!tok.match(ch)) {
                            if (ch >= 0x10000)  return -1;
                            char uch;
                            if (!tok.match(uch = Character.toUpperCase((char)ch))
                                && !tok.match(Character.toLowerCase(uch)))
                                return -1;
                        }
                    } else {
                        if (!tok.match(ch))  return -1;
                    }
                    offset = o1;
                }
                op = op.next;
                break;

            case Op.ANCHOR:
                boolean go = false;
                switch (op.getData()) {
                case '^':
                    // Range start; with MULTIPLE_LINES also right after an EOL character.
                    if (isSet(opts, MULTIPLE_LINES)) {
                        if (!(offset == con.start
                              || offset > con.start && isEOLChar(target.charAt(offset-1))))
                            return -1;
                    } else {
                        if (offset != con.start)
                            return -1;
                    }
                    break;

                case '@':
                    // Range start or right after an EOL character, regardless of options.
                    if (!(offset == con.start
                          || offset > con.start && isEOLChar(target.charAt(offset-1))))
                        return -1;
                    break;

                case '$':
                    if (isSet(opts, MULTIPLE_LINES)) {
                        if (!(offset == con.limit
                              || offset < con.limit && isEOLChar(target.charAt(offset))))
                            return -1;
                    } else {
                        // Range end, or just before a final EOL character or a final CR LF pair.
                        if (!(offset == con.limit
                              || offset+1 == con.limit && isEOLChar(target.charAt(offset))
                              || offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN
                              && target.charAt(offset+1) == LINE_FEED))
                            return -1;
                    }
                    break;

                case 'A':
                    if (offset != con.start)  return -1;
                    break;

                case 'Z':
                    // Range end, or just before a final EOL character or a final CR LF pair.
                    if (!(offset == con.limit
                          || offset+1 == con.limit && isEOLChar(target.charAt(offset))
                          || offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN
                          && target.charAt(offset+1) == LINE_FEED))
                        return -1;
                    break;

                case 'z':
                    if (offset != con.limit)  return -1;
                    break;

                case 'b':
                    // Word boundary: the word types on both sides of offset must differ.
                    if (con.length == 0)  return -1;
                    {
                        int after = getWordType(target, con.start, con.limit, offset, opts);
                        if (after == WT_IGNORE)  return -1;
                        int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
                        if (after == before)  return -1;
                    }
                    break;

                case 'B':
                    // Complement of 'b'.
                    if (con.length == 0)
                        go = true;
                    else {
                        int after = getWordType(target, con.start, con.limit, offset, opts);
                        go = after == WT_IGNORE
                             || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
                    }
                    if (!go)  return -1;
                    break;

                case '<':
                    // A letter at offset preceded by a non-letter.
                    if (con.length == 0 || offset == con.limit)  return -1;
                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
                        return -1;
                    break;

                case '>':
                    // A non-letter at offset preceded by a letter.
                    if (con.length == 0 || offset == con.start)  return -1;
                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
                        return -1;
                    break;
                }
                op = op.next;
                break;

            case Op.BACKREFERENCE:
                {
                    int refno = op.getData();
                    if (refno <= 0 || refno >= this.nofparen)
                        throw new RuntimeException ("Internal Error: Reference number must be more than zero: "+refno);
                    // A group that has not been captured yet cannot be referenced.
                    if (con.match.getBeginning(refno) < 0
                        || con.match.getEnd(refno) < 0)
                        return -1;
                    int o2 = con.match.getBeginning(refno);
                    int literallen = con.match.getEnd(refno)-o2;
                    if (!isSet(opts, IGNORE_CASE)) {
                        if (dx > 0) {
                            if (!regionMatches(target, offset, con.limit, o2, literallen))
                                return -1;
                            offset += literallen;
                        } else {
                            if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
                                return -1;
                            offset -= literallen;
                        }
                    } else {
                        if (dx > 0) {
                            if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
                                return -1;
                            offset += literallen;
                        } else {
                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
                                                         o2, literallen))
                                return -1;
                            offset -= literallen;
                        }
                    }
                }
                op = op.next;
                break;
            case Op.STRING:
                {
                    String literal = op.getString();
                    int literallen = literal.length();
                    if (!isSet(opts, IGNORE_CASE)) {
                        if (dx > 0) {
                            if (!regionMatches(target, offset, con.limit, literal, literallen))
                                return -1;
                            offset += literallen;
                        } else {
                            if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
                                return -1;
                            offset -= literallen;
                        }
                    } else {
                        if (dx > 0) {
                            if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
                                return -1;
                            offset += literallen;
                        } else {
                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
                                                         literal, literallen))
                                return -1;
                            offset -= literallen;
                        }
                    }
                }
                op = op.next;
                break;

            case Op.CLOSURE:
                {
                    // con.offsets[id] remembers where this closure was last entered. Entering
                    // it again at the same offset means the child consumed nothing, so the
                    // closure is left instead of looping.
                    int id = op.getData();
                    if (id >= 0) {
                        int previousOffset = con.offsets[id];
                        if (previousOffset < 0 || previousOffset != offset) {
                            con.offsets[id] = offset;
                        } else {
                            con.offsets[id] = -1;
                            op = op.next;
                            break;
                        }
                    }
                    int ret = this.matchString(con, op.getChild(), offset, dx, opts);
                    if (id >= 0)  con.offsets[id] = -1;
                    if (ret >= 0)  return ret;
                    op = op.next;
                }
                break;

            case Op.QUESTION:
                {
                    // Greedy: try the child first, then continue without it.
                    int ret = this.matchString(con, op.getChild(), offset, dx, opts);
                    if (ret >= 0)  return ret;
                    op = op.next;
                }
                break;

            case Op.NONGREEDYCLOSURE:
            case Op.NONGREEDYQUESTION:
                {
                    // Non-greedy: try the continuation first, then the child.
                    int ret = this.matchString(con, op.next, offset, dx, opts);
                    if (ret >= 0)  return ret;
                    op = op.getChild();
                }
                break;

            case Op.UNION:
                // The first alternative that matches wins.
                for (int i = 0;  i < op.size();  i ++) {
                    int ret = this.matchString(con, op.elementAt(i), offset, dx, opts);
                    if (DEBUG) {
                        System.err.println("UNION: "+i+", ret="+ret);
                    }
                    if (ret >= 0)  return ret;
                }
                return -1;

            case Op.CAPTURE:
                // A positive number records the beginning of a group, a negative one its end;
                // the previous value is restored if the rest of the pattern fails.
                int refno = op.getData();
                if (con.match != null && refno > 0) {
                    int save = con.match.getBeginning(refno);
                    con.match.setBeginning(refno, offset);
                    int ret = this.matchString(con, op.next, offset, dx, opts);
                    if (ret < 0)  con.match.setBeginning(refno, save);
                    return ret;
                } else if (con.match != null && refno < 0) {
                    int index = -refno;
                    int save = con.match.getEnd(index);
                    con.match.setEnd(index, offset);
                    int ret = this.matchString(con, op.next, offset, dx, opts);
                    if (ret < 0)  con.match.setEnd(index, save);
                    return ret;
                }
                op = op.next;
                break;

            // Look-around: the child is matched forwards (1) or backwards (-1) from offset,
            // and offset itself is left unchanged.
            case Op.LOOKAHEAD:
                if (0 > this.matchString(con, op.getChild(), offset, 1, opts))  return -1;
                op = op.next;
                break;
            case Op.NEGATIVELOOKAHEAD:
                if (0 <= this.matchString(con, op.getChild(), offset, 1, opts))  return -1;
                op = op.next;
                break;
            case Op.LOOKBEHIND:
                if (0 > this.matchString(con, op.getChild(), offset, -1, opts))  return -1;
                op = op.next;
                break;
            case Op.NEGATIVELOOKBEHIND:
                if (0 <= this.matchString(con, op.getChild(), offset, -1, opts))  return -1;
                op = op.next;
                break;

            case Op.INDEPENDENT:
                {
                    // The child's result is final: matching continues from where it ended.
                    int ret = this.matchString(con, op.getChild(), offset, dx, opts);
                    if (ret < 0)  return ret;
                    offset = ret;
                    op = op.next;
                }
                break;

            case Op.MODIFIER:
                {
                    // The child runs with getData() flags added and getData2() flags removed.
                    int localopts = opts;
                    localopts |= op.getData();
                    localopts &= ~op.getData2();
                    int ret = this.matchString(con, op.getChild(), offset, dx, localopts);
                    if (ret < 0)  return ret;
                    offset = ret;
                    op = op.next;
                }
                break;

            case Op.CONDITION:
                {
                    Op.ConditionOp cop = (Op.ConditionOp)op;
                    boolean matchp = false;
                    if (cop.refNumber > 0) {
                        // The condition is "group refNumber has been captured".
                        if (cop.refNumber >= this.nofparen)
                            throw new RuntimeException ("Internal Error: Reference number must be more than zero: "+cop.refNumber);
                        matchp = con.match.getBeginning(cop.refNumber) >= 0
                                 && con.match.getEnd(cop.refNumber) >= 0;
                    } else {
                        matchp = 0 <= this.
matchString (con, cop.condition, offset, dx, opts); 2762 } 2763 2764 if (matchp) { 2765 op = cop.yes; 2766 } else if (cop.no != null) { 2767 op = cop.no; 2768 } else { 2769 op = cop.next; 2770 } 2771 } 2772 break; 2773 2774 default: 2775 throw new RuntimeException ("Unknown operation type: "+op.type); 2776 } } } 2779 2780 private static final int getPreviousWordType(String target, int begin, int end, 2781 int offset, int opts) { 2782 int ret = getWordType(target, begin, end, --offset, opts); 2783 while (ret == WT_IGNORE) 2784 ret = getWordType(target, begin, end, --offset, opts); 2785 return ret; 2786 } 2787 2788 private static final int getWordType(String target, int begin, int end, 2789 int offset, int opts) { 2790 if (offset < begin || offset >= end) return WT_OTHER; 2791 return getWordType0( target .charAt( offset ) , opts); 2792 } 2793 2794 2795 private static final boolean regionMatches(String text, int offset, int limit, 2796 String part, int partlen) { 2797 if (limit-offset < partlen) return false; 2798 return text.regionMatches(offset, part, 0, partlen); 2799 } 2800 2801 private static final boolean regionMatches(String text, int offset, int limit, 2802 int offset2, int partlen) { 2803 if (limit-offset < partlen) return false; 2804 return text.regionMatches(offset, text, offset2, partlen); 2805 } 2806 2807 private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit, 2808 String part, int partlen) { 2809 return text.regionMatches(true, offset, part, 0, partlen); 2810 } 2811 2812 private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit, 2813 int offset2, int partlen) { 2814 if (limit-offset < partlen) return false; 2815 return text.regionMatches(true, offset, text, offset2, partlen); 2816 } 2817 2818 2819 2820 2821 2822 2823 2824 2829 public boolean matches(CharacterIterator target) { 2830 return this.matches(target, (Match)null); 2831 } 2832 2833 2834 2840 public boolean matches(CharacterIterator 
target, Match match) { 2841 int start = target.getBeginIndex(); 2842 int end = target.getEndIndex(); 2843 2844 2845 2846 synchronized (this) { 2847 if (this.operations == null) 2848 this.prepare(); 2849 if (this.context == null) 2850 this.context = new Context(); 2851 } 2852 Context con = null; 2853 synchronized (this.context) { 2854 con = this.context.inuse ? new Context() : this.context; 2855 con.reset(target, start, end, this.numberOfClosures); 2856 } 2857 if (match != null) { 2858 match.setNumberOfGroups(this.nofparen); 2859 match.setSource(target); 2860 } else if (this.hasBackReferences) { 2861 match = new Match(); 2862 match.setNumberOfGroups(this.nofparen); 2863 } 2866 con.match = match; 2867 2868 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 2869 int matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options); 2870 if (matchEnd == con.limit) { 2872 if (con.match != null) { 2873 con.match.setBeginning(0, con.start); 2874 con.match.setEnd(0, matchEnd); 2875 } 2876 con.inuse = false; 2877 return true; 2878 } 2879 return false; 2880 } 2881 2882 2886 if (this.fixedStringOnly) { 2887 int o = this.fixedStringTable.matches(target, con.start, con.limit); 2889 if (o >= 0) { 2890 if (con.match != null) { 2891 con.match.setBeginning(0, o); 2892 con.match.setEnd(0, o+this.fixedString.length()); 2893 } 2894 con.inuse = false; 2895 return true; 2896 } 2897 con.inuse = false; 2898 return false; 2899 } 2900 2901 2906 if (this.fixedString != null) { 2907 int o = this.fixedStringTable.matches(target, con.start, con.limit); 2908 if (o < 0) { 2909 con.inuse = false; 2911 return false; 2912 } 2913 } 2914 2915 int limit = con.limit-this.minlength; 2916 int matchStart; 2917 int matchEnd = -1; 2918 2919 2922 if (this.operations != null 2923 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 2924 if (isSet(this.options, SINGLE_LINE)) { 2925 matchStart = con.start; 2926 matchEnd = this. 
matchCharacterIterator (con, this.operations, con.start, 1, this.options); 2927 } else { 2928 boolean previousIsEOL = true; 2929 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2930 int ch = target .setIndex( matchStart ) ; 2931 if (isEOLChar(ch)) { 2932 previousIsEOL = true; 2933 } else { 2934 if (previousIsEOL) { 2935 if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, 2936 matchStart, 1, this.options))) 2937 break; 2938 } 2939 previousIsEOL = false; 2940 } 2941 } 2942 } 2943 } 2944 2945 2948 else if (this.firstChar != null) { 2949 RangeToken range = this.firstChar; 2951 if (RegularExpression.isSet(this.options, IGNORE_CASE)) { 2952 range = this.firstChar.getCaseInsensitiveToken(); 2953 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2954 int ch = target .setIndex( matchStart ) ; 2955 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 2956 ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) ); 2957 if (!range.match(ch)) continue; 2958 } else { 2959 if (!range.match(ch)) { 2960 char ch1 = Character.toUpperCase((char)ch); 2961 if (!range.match(ch1)) 2962 if (!range.match(Character.toLowerCase(ch1))) 2963 continue; 2964 } 2965 } 2966 if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, 2967 matchStart, 1, this.options))) 2968 break; 2969 } 2970 } else { 2971 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2972 int ch = target .setIndex( matchStart ) ; 2973 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) 2974 ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) ); 2975 if (!range.match(ch)) continue; 2976 if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, 2977 matchStart, 1, this.options))) 2978 break; 2979 } 2980 } 2981 } 2982 2983 2986 else { 2987 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2988 if (0 <= (matchEnd = this. 
matchCharacterIterator (con, this.operations, matchStart, 1, this.options))) 2989 break; 2990 } 2991 } 2992 2993 if (matchEnd >= 0) { 2994 if (con.match != null) { 2995 con.match.setBeginning(0, matchStart); 2996 con.match.setEnd(0, matchEnd); 2997 } 2998 con.inuse = false; 2999 return true; 3000 } else { 3001 con.inuse = false; 3002 return false; 3003 } 3004 } 3005 3006 3009 private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) { 3010 3011 3012 CharacterIterator target = con.ciTarget; 3013 3014 3015 3016 3017 3018 3019 while (true) { 3020 if (op == null) 3021 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; 3022 if (offset > con.limit || offset < con.start) 3023 return -1; 3024 switch (op.type) { 3025 case Op.CHAR: 3026 if (isSet(opts, IGNORE_CASE)) { 3027 int ch = op.getData(); 3028 if (dx > 0) { 3029 if (offset >= con.limit || !matchIgnoreCase(ch, target .setIndex( offset ) )) 3030 return -1; 3031 offset ++; 3032 } else { 3033 int o1 = offset-1; 3034 if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .setIndex( o1 ) )) 3035 return -1; 3036 offset = o1; 3037 } 3038 } else { 3039 int ch = op.getData(); 3040 if (dx > 0) { 3041 if (offset >= con.limit || ch != target .setIndex( offset ) ) 3042 return -1; 3043 offset ++; 3044 } else { 3045 int o1 = offset-1; 3046 if (o1 >= con.limit || o1 < 0 || ch != target .setIndex( o1 ) ) 3047 return -1; 3048 offset = o1; 3049 } 3050 } 3051 op = op.next; 3052 break; 3053 3054 case Op.DOT: 3055 if (dx > 0) { 3056 if (offset >= con.limit) 3057 return -1; 3058 int ch = target .setIndex( offset ) ; 3059 if (isSet(opts, SINGLE_LINE)) { 3060 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 3061 offset ++; 3062 } else { 3063 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 3064 ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) ); 3065 if (isEOLChar(ch)) 3066 return -1; 3067 } 3068 offset ++; 3069 } else { 3070 int o1 = offset-1; 3071 if (o1 
>= con.limit || o1 < 0) 3072 return -1; 3073 int ch = target .setIndex( o1 ) ; 3074 if (isSet(opts, SINGLE_LINE)) { 3075 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 3076 o1 --; 3077 } else { 3078 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 3079 ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch); 3080 if (!isEOLChar(ch)) 3081 return -1; 3082 } 3083 offset = o1; 3084 } 3085 op = op.next; 3086 break; 3087 3088 case Op.RANGE: 3089 case Op.NRANGE: 3090 if (dx > 0) { 3091 if (offset >= con.limit) 3092 return -1; 3093 int ch = target .setIndex( offset ) ; 3094 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 3095 ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) ); 3096 RangeToken tok = op.getToken(); 3097 if (isSet(opts, IGNORE_CASE)) { 3098 tok = tok.getCaseInsensitiveToken(); 3099 if (!tok.match(ch)) { 3100 if (ch >= 0x10000) return -1; 3101 char uch; 3102 if (!tok.match(uch = Character.toUpperCase((char)ch)) 3103 && !tok.match(Character.toLowerCase(uch))) 3104 return -1; 3105 } 3106 } else { 3107 if (!tok.match(ch)) return -1; 3108 } 3109 offset ++; 3110 } else { 3111 int o1 = offset-1; 3112 if (o1 >= con.limit || o1 < 0) 3113 return -1; 3114 int ch = target .setIndex( o1 ) ; 3115 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 3116 ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch); 3117 RangeToken tok = op.getToken(); 3118 if (isSet(opts, IGNORE_CASE)) { 3119 tok = tok.getCaseInsensitiveToken(); 3120 if (!tok.match(ch)) { 3121 if (ch >= 0x10000) return -1; 3122 char uch; 3123 if (!tok.match(uch = Character.toUpperCase((char)ch)) 3124 && !tok.match(Character.toLowerCase(uch))) 3125 return -1; 3126 } 3127 } else { 3128 if (!tok.match(ch)) return -1; 3129 } 3130 offset = o1; 3131 } 3132 op = op.next; 3133 break; 3134 3135 case Op.ANCHOR: 3136 boolean go = false; 3137 switch (op.getData()) { 3138 case '^': 3139 if (isSet(opts, MULTIPLE_LINES)) { 3140 if (!(offset == con.start 3141 || offset > con.start && isEOLChar( 
target .setIndex( offset-1 ) ))) 3142 return -1; 3143 } else { 3144 if (offset != con.start) 3145 return -1; 3146 } 3147 break; 3148 3149 case '@': if (!(offset == con.start 3152 || offset > con.start && isEOLChar( target .setIndex( offset-1 ) ))) 3153 return -1; 3154 break; 3155 3156 case '$': 3157 if (isSet(opts, MULTIPLE_LINES)) { 3158 if (!(offset == con.limit 3159 || offset < con.limit && isEOLChar( target .setIndex( offset ) ))) 3160 return -1; 3161 } else { 3162 if (!(offset == con.limit 3163 || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) ) 3164 || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN 3165 && target .setIndex( offset+1 ) == LINE_FEED)) 3166 return -1; 3167 } 3168 break; 3169 3170 case 'A': 3171 if (offset != con.start) return -1; 3172 break; 3173 3174 case 'Z': 3175 if (!(offset == con.limit 3176 || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) ) 3177 || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN 3178 && target .setIndex( offset+1 ) == LINE_FEED)) 3179 return -1; 3180 break; 3181 3182 case 'z': 3183 if (offset != con.limit) return -1; 3184 break; 3185 3186 case 'b': 3187 if (con.length == 0) return -1; 3188 { 3189 int after = getWordType(target, con.start, con.limit, offset, opts); 3190 if (after == WT_IGNORE) return -1; 3191 int before = getPreviousWordType(target, con.start, con.limit, offset, opts); 3192 if (after == before) return -1; 3193 } 3194 break; 3195 3196 case 'B': 3197 if (con.length == 0) 3198 go = true; 3199 else { 3200 int after = getWordType(target, con.start, con.limit, offset, opts); 3201 go = after == WT_IGNORE 3202 || after == getPreviousWordType(target, con.start, con.limit, offset, opts); 3203 } 3204 if (!go) return -1; 3205 break; 3206 3207 case '<': 3208 if (con.length == 0 || offset == con.limit) return -1; 3209 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER 3210 || getPreviousWordType(target, con.start, 
con.limit, offset, opts) != WT_OTHER) 3211 return -1; 3212 break; 3213 3214 case '>': 3215 if (con.length == 0 || offset == con.start) return -1; 3216 if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER 3217 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) 3218 return -1; 3219 break; 3220 } op = op.next; 3222 break; 3223 3224 case Op.BACKREFERENCE: 3225 { 3226 int refno = op.getData(); 3227 if (refno <= 0 || refno >= this.nofparen) 3228 throw new RuntimeException ("Internal Error: Reference number must be more than zero: "+refno); 3229 if (con.match.getBeginning(refno) < 0 3230 || con.match.getEnd(refno) < 0) 3231 return -1; int o2 = con.match.getBeginning(refno); 3233 int literallen = con.match.getEnd(refno)-o2; 3234 if (!isSet(opts, IGNORE_CASE)) { 3235 if (dx > 0) { 3236 if (!regionMatches(target, offset, con.limit, o2, literallen)) 3237 return -1; 3238 offset += literallen; 3239 } else { 3240 if (!regionMatches(target, offset-literallen, con.limit, o2, literallen)) 3241 return -1; 3242 offset -= literallen; 3243 } 3244 } else { 3245 if (dx > 0) { 3246 if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen)) 3247 return -1; 3248 offset += literallen; 3249 } else { 3250 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 3251 o2, literallen)) 3252 return -1; 3253 offset -= literallen; 3254 } 3255 } 3256 } 3257 op = op.next; 3258 break; 3259 case Op.STRING: 3260 { 3261 String literal = op.getString(); 3262 int literallen = literal.length(); 3263 if (!isSet(opts, IGNORE_CASE)) { 3264 if (dx > 0) { 3265 if (!regionMatches(target, offset, con.limit, literal, literallen)) 3266 return -1; 3267 offset += literallen; 3268 } else { 3269 if (!regionMatches(target, offset-literallen, con.limit, literal, literallen)) 3270 return -1; 3271 offset -= literallen; 3272 } 3273 } else { 3274 if (dx > 0) { 3275 if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen)) 3276 return 
-1; 3277 offset += literallen; 3278 } else { 3279 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 3280 literal, literallen)) 3281 return -1; 3282 offset -= literallen; 3283 } 3284 } 3285 } 3286 op = op.next; 3287 break; 3288 3289 case Op.CLOSURE: 3290 { 3291 3295 int id = op.getData(); 3296 if (id >= 0) { 3297 int previousOffset = con.offsets[id]; 3298 if (previousOffset < 0 || previousOffset != offset) { 3299 con.offsets[id] = offset; 3300 } else { 3301 con.offsets[id] = -1; 3302 op = op.next; 3303 break; 3304 } 3305 } 3306 3307 int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); 3308 if (id >= 0) con.offsets[id] = -1; 3309 if (ret >= 0) return ret; 3310 op = op.next; 3311 } 3312 break; 3313 3314 case Op.QUESTION: 3315 { 3316 int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); 3317 if (ret >= 0) return ret; 3318 op = op.next; 3319 } 3320 break; 3321 3322 case Op.NONGREEDYCLOSURE: 3323 case Op.NONGREEDYQUESTION: 3324 { 3325 int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts); 3326 if (ret >= 0) return ret; 3327 op = op.getChild(); 3328 } 3329 break; 3330 3331 case Op.UNION: 3332 for (int i = 0; i < op.size(); i ++) { 3333 int ret = this. matchCharacterIterator (con, op.elementAt(i), offset, dx, opts); 3334 if (DEBUG) { 3335 System.err.println("UNION: "+i+", ret="+ret); 3336 } 3337 if (ret >= 0) return ret; 3338 } 3339 return -1; 3340 3341 case Op.CAPTURE: 3342 int refno = op.getData(); 3343 if (con.match != null && refno > 0) { 3344 int save = con.match.getBeginning(refno); 3345 con.match.setBeginning(refno, offset); 3346 int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts); 3347 if (ret < 0) con.match.setBeginning(refno, save); 3348 return ret; 3349 } else if (con.match != null && refno < 0) { 3350 int index = -refno; 3351 int save = con.match.getEnd(index); 3352 con.match.setEnd(index, offset); 3353 int ret = this. 
matchCharacterIterator (con, op.next, offset, dx, opts); 3354 if (ret < 0) con.match.setEnd(index, save); 3355 return ret; 3356 } 3357 op = op.next; 3358 break; 3359 3360 case Op.LOOKAHEAD: 3361 if (0 > this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1; 3362 op = op.next; 3363 break; 3364 case Op.NEGATIVELOOKAHEAD: 3365 if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1; 3366 op = op.next; 3367 break; 3368 case Op.LOOKBEHIND: 3369 if (0 > this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1; 3370 op = op.next; 3371 break; 3372 case Op.NEGATIVELOOKBEHIND: 3373 if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1; 3374 op = op.next; 3375 break; 3376 3377 case Op.INDEPENDENT: 3378 { 3379 int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); 3380 if (ret < 0) return ret; 3381 offset = ret; 3382 op = op.next; 3383 } 3384 break; 3385 3386 case Op.MODIFIER: 3387 { 3388 int localopts = opts; 3389 localopts |= op.getData(); 3390 localopts &= ~op.getData2(); 3391 int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, localopts); 3393 if (ret < 0) return ret; 3394 offset = ret; 3395 op = op.next; 3396 } 3397 break; 3398 3399 case Op.CONDITION: 3400 { 3401 Op.ConditionOp cop = (Op.ConditionOp)op; 3402 boolean matchp = false; 3403 if (cop.refNumber > 0) { 3404 if (cop.refNumber >= this.nofparen) 3405 throw new RuntimeException ("Internal Error: Reference number must be more than zero: "+cop.refNumber); 3406 matchp = con.match.getBeginning(cop.refNumber) >= 0 3407 && con.match.getEnd(cop.refNumber) >= 0; 3408 } else { 3409 matchp = 0 <= this. 
matchCharacterIterator (con, cop.condition, offset, dx, opts); 3410 } 3411 3412 if (matchp) { 3413 op = cop.yes; 3414 } else if (cop.no != null) { 3415 op = cop.no; 3416 } else { 3417 op = cop.next; 3418 } 3419 } 3420 break; 3421 3422 default: 3423 throw new RuntimeException ("Unknown operation type: "+op.type); 3424 } } } 3427 3428 private static final int getPreviousWordType(CharacterIterator target, int begin, int end, 3429 int offset, int opts) { 3430 int ret = getWordType(target, begin, end, --offset, opts); 3431 while (ret == WT_IGNORE) 3432 ret = getWordType(target, begin, end, --offset, opts); 3433 return ret; 3434 } 3435 3436 private static final int getWordType(CharacterIterator target, int begin, int end, 3437 int offset, int opts) { 3438 if (offset < begin || offset >= end) return WT_OTHER; 3439 return getWordType0( target .setIndex( offset ) , opts); 3440 } 3441 3442 3443 3444 private static final boolean regionMatches(CharacterIterator target, int offset, int limit, 3445 String part, int partlen) { 3446 if (offset < 0) return false; 3447 if (limit-offset < partlen) 3448 return false; 3449 int i = 0; 3450 while (partlen-- > 0) { 3451 if ( target .setIndex( offset++ ) != part.charAt(i++)) 3452 return false; 3453 } 3454 return true; 3455 } 3456 3457 private static final boolean regionMatches(CharacterIterator target, int offset, int limit, 3458 int offset2, int partlen) { 3459 if (offset < 0) return false; 3460 if (limit-offset < partlen) 3461 return false; 3462 int i = offset2; 3463 while (partlen-- > 0) { 3464 if ( target .setIndex( offset++ ) != target .setIndex( i++ ) ) 3465 return false; 3466 } 3467 return true; 3468 } 3469 3470 3473 private static final boolean regionMatchesIgnoreCase(CharacterIterator target, int offset, int limit, 3474 String part, int partlen) { 3475 if (offset < 0) return false; 3476 if (limit-offset < partlen) 3477 return false; 3478 int i = 0; 3479 while (partlen-- > 0) { 3480 char ch1 = target .setIndex( offset++ ) ; 3481 
char ch2 = part.charAt(i++); 3482 if (ch1 == ch2) 3483 continue; 3484 char uch1 = Character.toUpperCase(ch1); 3485 char uch2 = Character.toUpperCase(ch2); 3486 if (uch1 == uch2) 3487 continue; 3488 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 3489 return false; 3490 } 3491 return true; 3492 } 3493 3494 private static final boolean regionMatchesIgnoreCase(CharacterIterator target, int offset, int limit, 3495 int offset2, int partlen) { 3496 if (offset < 0) return false; 3497 if (limit-offset < partlen) 3498 return false; 3499 int i = offset2; 3500 while (partlen-- > 0) { 3501 char ch1 = target .setIndex( offset++ ) ; 3502 char ch2 = target .setIndex( i++ ) ; 3503 if (ch1 == ch2) 3504 continue; 3505 char uch1 = Character.toUpperCase(ch1); 3506 char uch2 = Character.toUpperCase(ch2); 3507 if (uch1 == uch2) 3508 continue; 3509 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 3510 return false; 3511 } 3512 return true; 3513 } 3514 3515 3516 3517 3518 3520 3524 String regex; 3525 3528 int options; 3529 3530 3534 int nofparen; 3535 3539 Token tokentree; 3540 3541 boolean hasBackReferences = false; 3542 3543 transient int minlength; 3544 transient Op operations = null; 3545 transient int numberOfClosures; 3546 transient Context context = null; 3547 transient RangeToken firstChar = null; 3548 3549 transient String fixedString = null; 3550 transient int fixedStringOptions; 3551 transient BMPattern fixedStringTable = null; 3552 transient boolean fixedStringOnly = false; 3553 3554 3555 static final class Context { 3556 CharacterIterator ciTarget; 3557 String strTarget; 3558 char[] charTarget; 3559 int start; 3560 int limit; 3561 int length; 3562 Match match; 3563 boolean inuse = false; 3564 int[] offsets; 3565 3566 Context() { 3567 } 3568 3569 private void resetCommon(int nofclosures) { 3570 this.length = this.limit-this.start; 3571 this.inuse = true; 3572 this.match = null; 3573 if (this.offsets == null || this.offsets.length != nofclosures) 
3574 this.offsets = new int[nofclosures]; 3575 for (int i = 0; i < nofclosures; i ++) this.offsets[i] = -1; 3576 } 3577 void reset(CharacterIterator target, int start, int limit, int nofclosures) { 3578 this.ciTarget = target; 3579 this.start = start; 3580 this.limit = limit; 3581 this.resetCommon(nofclosures); 3582 } 3583 void reset(String target, int start, int limit, int nofclosures) { 3584 this.strTarget = target; 3585 this.start = start; 3586 this.limit = limit; 3587 this.resetCommon(nofclosures); 3588 } 3589 void reset(char[] target, int start, int limit, int nofclosures) { 3590 this.charTarget = target; 3591 this.start = start; 3592 this.limit = limit; 3593 this.resetCommon(nofclosures); 3594 } 3595 } 3596 3597 3600 void prepare() { 3601 if (Op.COUNT) Op.nofinstances = 0; 3602 this.compile(this.tokentree); 3603 3610 if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances); 3611 3612 this.minlength = this.tokentree.getMinLength(); 3613 3614 this.firstChar = null; 3615 if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) 3616 && !isSet(this.options, XMLSCHEMA_MODE)) { 3617 RangeToken firstChar = Token.createRange(); 3618 int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options); 3619 if (fresult == Token.FC_TERMINAL) { 3620 firstChar.compactRanges(); 3621 this.firstChar = firstChar; 3622 if (DEBUG) 3623 System.err.println("DEBUG: Use the first character optimization: "+firstChar); 3624 } 3625 } 3626 3627 if (this.operations != null 3628 && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR) 3629 && this.operations.next == null) { 3630 if (DEBUG) 3631 System.err.print(" *** Only fixed string! 
*** "); 3632 this.fixedStringOnly = true; 3633 if (this.operations.type == Op.STRING) 3634 this.fixedString = this.operations.getString(); 3635 else if (this.operations.getData() >= 0x10000) { this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData()); 3637 } else { 3638 char[] ac = new char[1]; 3639 ac[0] = (char)this.operations.getData(); 3640 this.fixedString = new String (ac); 3641 } 3642 this.fixedStringOptions = this.options; 3643 this.fixedStringTable = new BMPattern(this.fixedString, 256, 3644 isSet(this.fixedStringOptions, IGNORE_CASE)); 3645 } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION) 3646 && !isSet(this.options, XMLSCHEMA_MODE)) { 3647 Token.FixedStringContainer container = new Token.FixedStringContainer(); 3648 this.tokentree.findFixedString(container, this.options); 3649 this.fixedString = container.token == null ? null : container.token.getString(); 3650 this.fixedStringOptions = container.options; 3651 if (this.fixedString != null && this.fixedString.length() < 2) 3652 this.fixedString = null; 3653 if (this.fixedString != null) { 3655 this.fixedStringTable = new BMPattern(this.fixedString, 256, 3656 isSet(this.fixedStringOptions, IGNORE_CASE)); 3657 if (DEBUG) { 3658 System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length() 3659 +"/" +"/"+REUtil.createOptionString(this.fixedStringOptions)); 3661 System.err.print("String: "); 3662 REUtil.dumpString(this.fixedString); 3663 } 3664 } 3665 } 3666 } 3667 3668 3678 3679 3682 static final int IGNORE_CASE = 1<<1; 3683 3684 3687 static final int SINGLE_LINE = 1<<2; 3688 3689 3692 static final int MULTIPLE_LINES = 1<<3; 3693 3694 3697 static final int EXTENDED_COMMENT = 1<<4; 3698 3699 3706 static final int USE_UNICODE_CATEGORY = 1<<5; 3708 3720 static final int UNICODE_WORD_BOUNDARY = 1<<6; 3722 3725 static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7; 3726 3729 static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8; 3730 3733 static 
final int XMLSCHEMA_MODE = 1<<9; 3734 3737 static final int SPECIAL_COMMA = 1<<10; 3738 3739 3740 private static final boolean isSet(int options, int flag) { 3741 return (options & flag) == flag; 3742 } 3743 3744 3750 public RegularExpression(String regex) throws ParseException { 3751 this.setPattern(regex, null); 3752 } 3753 3754 3761 public RegularExpression(String regex, String options) throws ParseException { 3762 this.setPattern(regex, options); 3763 } 3764 3765 RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) { 3766 this.regex = regex; 3767 this.tokentree = tok; 3768 this.nofparen = parens; 3769 this.options = options; 3770 this.hasBackReferences = hasBackReferences; 3771 } 3772 3773 3776 public void setPattern(String newPattern) throws ParseException { 3777 this.setPattern(newPattern, this.options); 3778 } 3779 3780 private void setPattern(String newPattern, int options) throws ParseException { 3781 this.regex = newPattern; 3782 this.options = options; 3783 RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE) 3784 ? 
new ParserForXMLSchema() : new RegexParser(); 3785 this.tokentree = rp.parse(this.regex, this.options); 3786 this.nofparen = rp.parennumber; 3787 this.hasBackReferences = rp.hasBackReferences; 3788 3789 this.operations = null; 3790 this.context = null; 3791 } 3792 3795 public void setPattern(String newPattern, String options) throws ParseException { 3796 this.setPattern(newPattern, REUtil.parseOptions(options)); 3797 } 3798 3799 3802 public String getPattern() { 3803 return this.regex; 3804 } 3805 3806 3809 public String toString() { 3810 return this.tokentree.toString(this.options); 3811 } 3812 3813 3821 public String getOptions() { 3822 return REUtil.createOptionString(this.options); 3823 } 3824 3825 3828 public boolean equals(Object obj) { 3829 if (obj == null) return false; 3830 if (!(obj instanceof RegularExpression)) 3831 return false; 3832 RegularExpression r = (RegularExpression)obj; 3833 return this.regex.equals(r.regex) && this.options == r.options; 3834 } 3835 3836 boolean equals(String pattern, int options) { 3837 return this.regex.equals(pattern) && this.options == options; 3838 } 3839 3840 3843 public int hashCode() { 3844 return (this.regex+"/"+this.getOptions()).hashCode(); 3845 } 3846 3847 3852 public int getNumberOfGroups() { 3853 return this.nofparen; 3854 } 3855 3856 3858 private static final int WT_IGNORE = 0; 3859 private static final int WT_LETTER = 1; 3860 private static final int WT_OTHER = 2; 3861 private static final int getWordType0(char ch, int opts) { 3862 if (!isSet(opts, UNICODE_WORD_BOUNDARY)) { 3863 if (isSet(opts, USE_UNICODE_CATEGORY)) { 3864 return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER; 3865 } 3866 return isWordChar(ch) ? 
WT_LETTER : WT_OTHER; 3867 } 3868 3869 switch (Character.getType(ch)) { 3870 case Character.UPPERCASE_LETTER: case Character.LOWERCASE_LETTER: case Character.TITLECASE_LETTER: case Character.MODIFIER_LETTER: case Character.OTHER_LETTER: case Character.LETTER_NUMBER: case Character.DECIMAL_DIGIT_NUMBER: case Character.OTHER_NUMBER: case Character.COMBINING_SPACING_MARK: return WT_LETTER; 3880 3881 case Character.FORMAT: case Character.NON_SPACING_MARK: case Character.ENCLOSING_MARK: return WT_IGNORE; 3885 3886 case Character.CONTROL: switch (ch) { 3888 case '\t': 3889 case '\n': 3890 case '\u000B': 3891 case '\f': 3892 case '\r': 3893 return WT_OTHER; 3894 default: 3895 return WT_IGNORE; 3896 } 3897 3898 default: 3899 return WT_OTHER; 3900 } 3901 } 3902 3903 3905 static final int LINE_FEED = 0x000A; 3906 static final int CARRIAGE_RETURN = 0x000D; 3907 static final int LINE_SEPARATOR = 0x2028; 3908 static final int PARAGRAPH_SEPARATOR = 0x2029; 3909 3910 private static final boolean isEOLChar(int ch) { 3911 return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR 3912 || ch == PARAGRAPH_SEPARATOR; 3913 } 3914 3915 private static final boolean isWordChar(int ch) { if (ch == '_') return true; 3917 if (ch < '0') return false; 3918 if (ch > 'z') return false; 3919 if (ch <= '9') return true; 3920 if (ch < 'A') return false; 3921 if (ch <= 'Z') return true; 3922 if (ch < 'a') return false; 3923 return true; 3924 } 3925 3926 private static final boolean matchIgnoreCase(int chardata, int ch) { 3927 if (chardata == ch) return true; 3928 if (chardata > 0xffff || ch > 0xffff) return false; 3929 char uch1 = Character.toUpperCase((char)chardata); 3930 char uch2 = Character.toUpperCase((char)ch); 3931 if (uch1 == uch2) return true; 3932 return Character.toLowerCase(uch1) == Character.toLowerCase(uch2); 3933 } 3934 } 3935 3936 public static class ParseException extends RuntimeException { 3937 int location; 3938 3939 3944 3947 public ParseException(String mes, int 
location) { 3948 super(mes); 3949 this.location = location; 3950 } 3951 3952 3956 public int getLocation() { 3957 return this.location; 3958 } 3959} 3960 3961 static class Op { 3962 static final int DOT = 0; 3963 static final int CHAR = 1; static final int RANGE = 3; static final int NRANGE = 4; static final int ANCHOR = 5; static final int STRING = 6; static final int CLOSURE = 7; static final int NONGREEDYCLOSURE = 8; static final int QUESTION = 9; static final int NONGREEDYQUESTION = 10; static final int UNION = 11; static final int CAPTURE = 15; static final int BACKREFERENCE = 16; static final int LOOKAHEAD = 20; static final int NEGATIVELOOKAHEAD = 21; static final int LOOKBEHIND = 22; static final int NEGATIVELOOKBEHIND = 23; static final int INDEPENDENT = 24; static final int MODIFIER = 25; static final int CONDITION = 26; 3983 static int nofinstances = 0; 3984 static final boolean COUNT = false; 3985 3986 static Op createDot() { 3987 if (Op.COUNT) Op.nofinstances ++; 3988 return new Op(Op.DOT); 3989 } 3990 static CharOp createChar(int data) { 3991 if (Op.COUNT) Op.nofinstances ++; 3992 return new CharOp(Op.CHAR, data); 3993 } 3994 static CharOp createAnchor(int data) { 3995 if (Op.COUNT) Op.nofinstances ++; 3996 return new CharOp(Op.ANCHOR, data); 3997 } 3998 static CharOp createCapture(int number, Op next) { 3999 if (Op.COUNT) Op.nofinstances ++; 4000 CharOp op = new CharOp(Op.CAPTURE, number); 4001 op.next = next; 4002 return op; 4003 } 4004 static UnionOp createUnion(int size) { 4005 if (Op.COUNT) Op.nofinstances ++; 4006 return new UnionOp(Op.UNION, size); 4008 } 4009 static ChildOp createClosure(int id) { 4010 if (Op.COUNT) Op.nofinstances ++; 4011 return new ModifierOp(Op.CLOSURE, id, -1); 4012 } 4013 static ChildOp createNonGreedyClosure() { 4014 if (Op.COUNT) Op.nofinstances ++; 4015 return new ChildOp(Op.NONGREEDYCLOSURE); 4016 } 4017 static ChildOp createQuestion(boolean nongreedy) { 4018 if (Op.COUNT) Op.nofinstances ++; 4019 return new 
ChildOp(nongreedy ? Op.NONGREEDYQUESTION : Op.QUESTION); 4020 } 4021 static RangeOp createRange(Token tok) { 4022 if (Op.COUNT) Op.nofinstances ++; 4023 return new RangeOp(Op.RANGE, tok); 4024 } 4025 static ChildOp createLook(int type, Op next, Op branch) { 4026 if (Op.COUNT) Op.nofinstances ++; 4027 ChildOp op = new ChildOp(type); 4028 op.setChild(branch); 4029 op.next = next; 4030 return op; 4031 } 4032 static CharOp createBackReference(int refno) { 4033 if (Op.COUNT) Op.nofinstances ++; 4034 return new CharOp(Op.BACKREFERENCE, refno); 4035 } 4036 static StringOp createString(String literal) { 4037 if (Op.COUNT) Op.nofinstances ++; 4038 return new StringOp(Op.STRING, literal); 4039 } 4040 static ChildOp createIndependent(Op next, Op branch) { 4041 if (Op.COUNT) Op.nofinstances ++; 4042 ChildOp op = new ChildOp(Op.INDEPENDENT); 4043 op.setChild(branch); 4044 op.next = next; 4045 return op; 4046 } 4047 static ModifierOp createModifier(Op next, Op branch, int add, int mask) { 4048 if (Op.COUNT) Op.nofinstances ++; 4049 ModifierOp op = new ModifierOp(Op.MODIFIER, add, mask); 4050 op.setChild(branch); 4051 op.next = next; 4052 return op; 4053 } 4054 static ConditionOp createCondition(Op next, int ref, Op conditionflow, Op yesflow, Op noflow) { 4055 if (Op.COUNT) Op.nofinstances ++; 4056 ConditionOp op = new ConditionOp(Op.CONDITION, ref, conditionflow, yesflow, noflow); 4057 op.next = next; 4058 return op; 4059 } 4060 4061 int type; 4062 Op next = null; 4063 4064 protected Op(int type) { 4065 this.type = type; 4066 } 4067 4068 int size() { return 0; 4070 } 4071 Op elementAt(int index) { throw new RuntimeException ("Internal Error: type="+this.type); 4073 } 4074 Op getChild() { throw new RuntimeException ("Internal Error: type="+this.type); 4076 } 4077 int getData() { throw new RuntimeException ("Internal Error: type="+this.type); 4080 } 4081 int getData2() { throw new RuntimeException ("Internal Error: type="+this.type); 4083 } 4084 RangeToken getToken() { throw new 
RuntimeException ("Internal Error: type="+this.type); 4086 } 4087 String getString() { throw new RuntimeException ("Internal Error: type="+this.type); 4089 } 4090 4091 static class CharOp extends Op { 4093 int charData; 4094 CharOp(int type, int data) { 4095 super(type); 4096 this.charData = data; 4097 } 4098 int getData() { 4099 return this.charData; 4100 } 4101 } 4102 4103 static class UnionOp extends Op { 4105 Vector branches; 4106 UnionOp(int type, int size) { 4107 super(type); 4108 this.branches = new Vector (size); 4109 } 4110 void addElement(Op op) { 4111 this.branches.addElement(op); 4112 } 4113 int size() { 4114 return this.branches.size(); 4115 } 4116 Op elementAt(int index) { 4117 return (Op)this.branches.elementAt(index); 4118 } 4119 } 4120 4121 static class ChildOp extends Op { 4123 Op child; 4124 ChildOp(int type) { 4125 super(type); 4126 } 4127 void setChild(Op child) { 4128 this.child = child; 4129 } 4130 Op getChild() { 4131 return this.child; 4132 } 4133 } 4134 static class ModifierOp extends ChildOp { 4136 int v1; 4137 int v2; 4138 ModifierOp(int type, int v1, int v2) { 4139 super(type); 4140 this.v1 = v1; 4141 this.v2 = v2; 4142 } 4143 int getData() { 4144 return this.v1; 4145 } 4146 int getData2() { 4147 return this.v2; 4148 } 4149 } 4150 static class RangeOp extends Op { 4152 Token tok; 4153 RangeOp(int type, Token tok) { 4154 super(type); 4155 this.tok = tok; 4156 } 4157 RangeToken getToken() { 4158 return (RangeToken)this.tok; 4159 } 4160 } 4161 static class StringOp extends Op { 4163 String string; 4164 StringOp(int type, String literal) { 4165 super(type); 4166 this.string = literal; 4167 } 4168 String getString() { 4169 return this.string; 4170 } 4171 } 4172 static class ConditionOp extends Op { 4174 int refNumber; 4175 Op condition; 4176 Op yes; 4177 Op no; 4178 ConditionOp(int type, int refno, Op conditionflow, Op yesflow, Op noflow) { 4179 super(type); 4180 this.refNumber = refno; 4181 this.condition = conditionflow; 4182 this.yes = 
yesflow; 4183 this.no = noflow; 4184 } 4185 } 4186} 4187 4188 final static class RangeToken extends Token implements java.io.Serializable { 4189 4190 int[] ranges; 4191 boolean sorted; 4192 boolean compacted; 4193 RangeToken icaseCache = null; 4194 int[] map = null; 4195 int nonMapIndex; 4196 4197 RangeToken(int type) { 4198 super(type); 4199 this.setSorted(false); 4200 } 4201 4202 protected void addRange(int start, int end) { 4204 this.icaseCache = null; 4205 int r1, r2; 4207 if (start <= end) { 4208 r1 = start; 4209 r2 = end; 4210 } else { 4211 r1 = end; 4212 r2 = start; 4213 } 4214 4215 int pos = 0; 4216 if (this.ranges == null) { 4217 this.ranges = new int[2]; 4218 this.ranges[0] = r1; 4219 this.ranges[1] = r2; 4220 this.setSorted(true); 4221 } else { 4222 pos = this.ranges.length; 4223 if (this.ranges[pos-1]+1 == r1) { 4224 this.ranges[pos-1] = r2; 4225 return; 4226 } 4227 int[] temp = new int[pos+2]; 4228 System.arraycopy(this.ranges, 0, temp, 0, pos); 4229 this.ranges = temp; 4230 if (this.ranges[pos-1] >= r1) 4231 this.setSorted(false); 4232 this.ranges[pos++] = r1; 4233 this.ranges[pos] = r2; 4234 if (!this.sorted) 4235 this.sortRanges(); 4236 } 4237 } 4238 4239 private final boolean isSorted() { 4240 return this.sorted; 4241 } 4242 private final void setSorted(boolean sort) { 4243 this.sorted = sort; 4244 if (!sort) this.compacted = false; 4245 } 4246 private final boolean isCompacted() { 4247 return this.compacted; 4248 } 4249 private final void setCompacted() { 4250 this.compacted = true; 4251 } 4252 4253 protected void sortRanges() { 4254 if (this.isSorted()) 4255 return; 4256 if (this.ranges == null) 4257 return; 4258 4260 for (int i = this.ranges.length-4; i >= 0; i -= 2) { 4264 for (int j = 0; j <= i; j += 2) { 4265 if (this.ranges[j] > this.ranges[j+2] 4266 || this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) { 4267 int tmp; 4268 tmp = this.ranges[j+2]; 4269 this.ranges[j+2] = this.ranges[j]; 4270 this.ranges[j] = tmp; 4271 
tmp = this.ranges[j+3]; 4272 this.ranges[j+3] = this.ranges[j+1]; 4273 this.ranges[j+1] = tmp; 4274 } 4275 } 4276 } 4277 this.setSorted(true); 4278 } 4279 4280 4283 protected void compactRanges() { 4284 boolean DEBUG = false; 4285 if (this.ranges == null || this.ranges.length <= 2) 4286 return; 4287 if (this.isCompacted()) 4288 return; 4289 int base = 0; int target = 0; 4292 while (target < this.ranges.length) { 4293 if (base != target) { 4294 this.ranges[base] = this.ranges[target++]; 4295 this.ranges[base+1] = this.ranges[target++]; 4296 } else 4297 target += 2; 4298 int baseend = this.ranges[base+1]; 4299 while (target < this.ranges.length) { 4300 if (baseend+1 < this.ranges[target]) 4301 break; 4302 if (baseend+1 == this.ranges[target]) { 4303 if (DEBUG) 4304 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base] 4305 +", "+this.ranges[base+1] 4306 +"], ["+this.ranges[target] 4307 +", "+this.ranges[target+1] 4308 +"] -> ["+this.ranges[base] 4309 +", "+this.ranges[target+1] 4310 +"]"); 4311 this.ranges[base+1] = this.ranges[target+1]; 4312 baseend = this.ranges[base+1]; 4313 target += 2; 4314 } else if (baseend >= this.ranges[target+1]) { 4315 if (DEBUG) 4316 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base] 4317 +", "+this.ranges[base+1] 4318 +"], ["+this.ranges[target] 4319 +", "+this.ranges[target+1] 4320 +"] -> ["+this.ranges[base] 4321 +", "+this.ranges[base+1] 4322 +"]"); 4323 target += 2; 4324 } else if (baseend < this.ranges[target+1]) { 4325 if (DEBUG) 4326 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base] 4327 +", "+this.ranges[base+1] 4328 +"], ["+this.ranges[target] 4329 +", "+this.ranges[target+1] 4330 +"] -> ["+this.ranges[base] 4331 +", "+this.ranges[target+1] 4332 +"]"); 4333 this.ranges[base+1] = this.ranges[target+1]; 4334 baseend = this.ranges[base+1]; 4335 target += 2; 4336 } else { 4337 throw new RuntimeException ("Token#compactRanges(): Internel Error: [" 4338 
+this.ranges[base] 4339 +","+this.ranges[base+1] 4340 +"] ["+this.ranges[target] 4341 +","+this.ranges[target+1]+"]"); 4342 } 4343 } base += 2; 4345 } 4346 4347 if (base != this.ranges.length) { 4348 int[] result = new int[base]; 4349 System.arraycopy(this.ranges, 0, result, 0, base); 4350 this.ranges = result; 4351 } 4352 this.setCompacted(); 4353 } 4354 4355 protected void mergeRanges(Token token) { 4356 RangeToken tok = (RangeToken)token; 4357 this.sortRanges(); 4358 tok.sortRanges(); 4359 if (tok.ranges == null) 4360 return; 4361 this.icaseCache = null; 4362 this.setSorted(true); 4363 if (this.ranges == null) { 4364 this.ranges = new int[tok.ranges.length]; 4365 System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length); 4366 return; 4367 } 4368 int[] result = new int[this.ranges.length+tok.ranges.length]; 4369 for (int i = 0, j = 0, k = 0; i < this.ranges.length || j < tok.ranges.length;) { 4370 if (i >= this.ranges.length) { 4371 result[k++] = tok.ranges[j++]; 4372 result[k++] = tok.ranges[j++]; 4373 } else if (j >= tok.ranges.length) { 4374 result[k++] = this.ranges[i++]; 4375 result[k++] = this.ranges[i++]; 4376 } else if (tok.ranges[j] < this.ranges[i] 4377 || tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) { 4378 result[k++] = tok.ranges[j++]; 4379 result[k++] = tok.ranges[j++]; 4380 } else { 4381 result[k++] = this.ranges[i++]; 4382 result[k++] = this.ranges[i++]; 4383 } 4384 } 4385 this.ranges = result; 4386 } 4387 4388 protected void subtractRanges(Token token) { 4389 if (token.type == NRANGE) { 4390 this.intersectRanges(token); 4391 return; 4392 } 4393 RangeToken tok = (RangeToken)token; 4394 if (tok.ranges == null || this.ranges == null) 4395 return; 4396 this.icaseCache = null; 4397 this.sortRanges(); 4398 this.compactRanges(); 4399 tok.sortRanges(); 4400 tok.compactRanges(); 4401 4402 4404 int[] result = new int[this.ranges.length+tok.ranges.length]; 4405 int wp = 0, src = 0, sub = 0; 4406 while (src < 
this.ranges.length && sub < tok.ranges.length) { 4407 int srcbegin = this.ranges[src]; 4408 int srcend = this.ranges[src+1]; 4409 int subbegin = tok.ranges[sub]; 4410 int subend = tok.ranges[sub+1]; 4411 if (srcend < subbegin) { result[wp++] = this.ranges[src++]; 4417 result[wp++] = this.ranges[src++]; 4418 } else if (srcend >= subbegin 4419 && srcbegin <= subend) { if (subbegin <= srcbegin && srcend <= subend) { 4426 src += 2; 4431 } else if (subbegin <= srcbegin) { 4432 this.ranges[src] = subend+1; 4437 sub += 2; 4438 } else if (srcend <= subend) { 4439 result[wp++] = srcbegin; 4444 result[wp++] = subbegin-1; 4445 src += 2; 4446 } else { 4447 result[wp++] = srcbegin; 4452 result[wp++] = subbegin-1; 4453 this.ranges[src] = subend+1; 4454 sub += 2; 4455 } 4456 } else if (subend < srcbegin) { 4457 sub += 2; 4461 } else { 4462 throw new RuntimeException ("Token#subtractRanges(): Internal Error: ["+this.ranges[src] 4463 +","+this.ranges[src+1] 4464 +"] - ["+tok.ranges[sub] 4465 +","+tok.ranges[sub+1] 4466 +"]"); 4467 } 4468 } 4469 while (src < this.ranges.length) { 4470 result[wp++] = this.ranges[src++]; 4471 result[wp++] = this.ranges[src++]; 4472 } 4473 this.ranges = new int[wp]; 4474 System.arraycopy(result, 0, this.ranges, 0, wp); 4475 } 4477 4478 4481 protected void intersectRanges(Token token) { 4482 RangeToken tok = (RangeToken)token; 4483 if (tok.ranges == null || this.ranges == null) 4484 return; 4485 this.icaseCache = null; 4486 this.sortRanges(); 4487 this.compactRanges(); 4488 tok.sortRanges(); 4489 tok.compactRanges(); 4490 4491 int[] result = new int[this.ranges.length+tok.ranges.length]; 4492 int wp = 0, src1 = 0, src2 = 0; 4493 while (src1 < this.ranges.length && src2 < tok.ranges.length) { 4494 int src1begin = this.ranges[src1]; 4495 int src1end = this.ranges[src1+1]; 4496 int src2begin = tok.ranges[src2]; 4497 int src2end = tok.ranges[src2+1]; 4498 if (src1end < src2begin) { src1 += 2; 4504 } else if (src1end >= src2begin 4505 && src1begin <= 
src2end) { if (src2begin <= src2begin && src1end <= src2end) { 4512 result[wp++] = src1begin; 4517 result[wp++] = src1end; 4518 src1 += 2; 4519 } else if (src2begin <= src1begin) { 4520 result[wp++] = src1begin; 4525 result[wp++] = src2end; 4526 this.ranges[src1] = src2end+1; 4527 src2 += 2; 4528 } else if (src1end <= src2end) { 4529 result[wp++] = src2begin; 4534 result[wp++] = src1end; 4535 src1 += 2; 4536 } else { 4537 result[wp++] = src2begin; 4542 result[wp++] = src2end; 4543 this.ranges[src1] = src2end+1; 4544 } 4545 } else if (src2end < src1begin) { 4546 src2 += 2; 4550 } else { 4551 throw new RuntimeException ("Token#intersectRanges(): Internal Error: [" 4552 +this.ranges[src1] 4553 +","+this.ranges[src1+1] 4554 +"] & ["+tok.ranges[src2] 4555 +","+tok.ranges[src2+1] 4556 +"]"); 4557 } 4558 } 4559 while (src1 < this.ranges.length) { 4560 result[wp++] = this.ranges[src1++]; 4561 result[wp++] = this.ranges[src1++]; 4562 } 4563 this.ranges = new int[wp]; 4564 System.arraycopy(result, 0, this.ranges, 0, wp); 4565 } 4567 4568 4572 static Token complementRanges(Token token) { 4573 if (token.type != RANGE && token.type != NRANGE) 4574 throw new IllegalArgumentException ("Token#complementRanges(): must be RANGE: "+token.type); 4575 RangeToken tok = (RangeToken)token; 4576 tok.sortRanges(); 4577 tok.compactRanges(); 4578 int len = tok.ranges.length+2; 4579 if (tok.ranges[0] == 0) 4580 len -= 2; 4581 int last = tok.ranges[tok.ranges.length-1]; 4582 if (last == UTF16_MAX) 4583 len -= 2; 4584 RangeToken ret = Token.createRange(); 4585 ret.ranges = new int[len]; 4586 int wp = 0; 4587 if (tok.ranges[0] > 0) { 4588 ret.ranges[wp++] = 0; 4589 ret.ranges[wp++] = tok.ranges[0]-1; 4590 } 4591 for (int i = 1; i < tok.ranges.length-2; i += 2) { 4592 ret.ranges[wp++] = tok.ranges[i]+1; 4593 ret.ranges[wp++] = tok.ranges[i+1]-1; 4594 } 4595 if (last != UTF16_MAX) { 4596 ret.ranges[wp++] = last+1; 4597 ret.ranges[wp] = UTF16_MAX; 4598 } 4599 ret.setCompacted(); 4600 return ret; 
4601 } 4602 4603 synchronized RangeToken getCaseInsensitiveToken() { 4604 if (this.icaseCache != null) 4605 return this.icaseCache; 4606 4607 RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange(); 4608 for (int i = 0; i < this.ranges.length; i += 2) { 4609 for (int ch = this.ranges[i]; ch <= this.ranges[i+1]; ch ++) { 4610 if (ch > 0xffff) 4611 uppers.addRange(ch, ch); 4612 else { 4613 char uch = Character.toUpperCase((char)ch); 4614 uppers.addRange(uch, uch); 4615 } 4616 } 4617 } 4618 RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange(); 4619 for (int i = 0; i < uppers.ranges.length; i += 2) { 4620 for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i+1]; ch ++) { 4621 if (ch > 0xffff) 4622 lowers.addRange(ch, ch); 4623 else { 4624 char uch = Character.toUpperCase((char)ch); 4625 lowers.addRange(uch, uch); 4626 } 4627 } 4628 } 4629 lowers.mergeRanges(uppers); 4630 lowers.mergeRanges(this); 4631 lowers.compactRanges(); 4632 4633 this.icaseCache = lowers; 4634 return lowers; 4635 } 4636 4637 void dumpRanges() { 4638 System.err.print("RANGE: "); 4639 if (this.ranges == null) 4640 System.err.println(" NULL"); 4641 for (int i = 0; i < this.ranges.length; i += 2) { 4642 System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] "); 4643 } 4644 System.err.println(""); 4645 } 4646 4647 boolean match(int ch) { 4648 if (this.map == null) this.createMap(); 4649 boolean ret; 4650 if (this.type == RANGE) { 4651 if (ch < MAPSIZE) 4652 return (this.map[ch/32] & (1<<(ch&0x1f))) != 0; 4653 ret = false; 4654 for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) { 4655 if (this.ranges[i] <= ch && ch <= this.ranges[i+1]) 4656 return true; 4657 } 4658 } else { 4659 if (ch < MAPSIZE) 4660 return (this.map[ch/32] & (1<<(ch&0x1f))) == 0; 4661 ret = true; 4662 for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) { 4663 if (this.ranges[i] <= ch && ch <= this.ranges[i+1]) 4664 return false; 4665 } 
4666 } 4667 return ret; 4668 } 4669 4670 private static final int MAPSIZE = 256; 4671 private void createMap() { 4672 int asize = MAPSIZE/32; this.map = new int[asize]; 4674 this.nonMapIndex = this.ranges.length; 4675 for (int i = 0; i < asize; i ++) this.map[i] = 0; 4676 for (int i = 0; i < this.ranges.length; i += 2) { 4677 int s = this.ranges[i]; 4678 int e = this.ranges[i+1]; 4679 if (s < MAPSIZE) { 4680 for (int j = s; j <= e && j < MAPSIZE; j ++) 4681 this.map[j/32] |= 1<<(j&0x1f); } else { 4683 this.nonMapIndex = i; 4684 break; 4685 } 4686 if (e >= MAPSIZE) { 4687 this.nonMapIndex = i; 4688 break; 4689 } 4690 } 4691 } 4693 4694 public String toString(int options) { 4695 String ret; 4696 if (this.type == RANGE) { 4697 if (this == Token.token_dot) 4698 ret = "."; 4699 else if (this == Token.token_0to9) 4700 ret = "\\d"; 4701 else if (this == Token.token_wordchars) 4702 ret = "\\w"; 4703 else if (this == Token.token_spaces) 4704 ret = "\\s"; 4705 else { 4706 StringBuffer sb = new StringBuffer (); 4707 sb.append("["); 4708 for (int i = 0; i < this.ranges.length; i += 2) { 4709 if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(","); 4710 if (this.ranges[i] == this.ranges[i+1]) { 4711 sb.append(escapeCharInCharClass(this.ranges[i])); 4712 } else { 4713 sb.append(escapeCharInCharClass(this.ranges[i])); 4714 sb.append('-'); 4715 sb.append(escapeCharInCharClass(this.ranges[i+1])); 4716 } 4717 } 4718 sb.append("]"); 4719 ret = sb.toString(); 4720 } 4721 } else { 4722 if (this == Token.token_not_0to9) 4723 ret = "\\D"; 4724 else if (this == Token.token_not_wordchars) 4725 ret = "\\W"; 4726 else if (this == Token.token_not_spaces) 4727 ret = "\\S"; 4728 else { 4729 StringBuffer sb = new StringBuffer (); 4730 sb.append("[^"); 4731 for (int i = 0; i < this.ranges.length; i += 2) { 4732 if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(","); 4733 if (this.ranges[i] == this.ranges[i+1]) { 4734 
sb.append(escapeCharInCharClass(this.ranges[i])); 4735 } else { 4736 sb.append(escapeCharInCharClass(this.ranges[i])); 4737 sb.append('-'); 4738 sb.append(escapeCharInCharClass(this.ranges[i+1])); 4739 } 4740 } 4741 sb.append("]"); 4742 ret = sb.toString(); 4743 } 4744 } 4745 return ret; 4746 } 4747 4748 private static String escapeCharInCharClass(int ch) { 4749 String ret; 4750 switch (ch) { 4751 case '[': case ']': case '-': case '^': 4752 case ',': case '\\': 4753 ret = "\\"+(char)ch; 4754 break; 4755 case '\f': ret = "\\f"; break; 4756 case '\n': ret = "\\n"; break; 4757 case '\r': ret = "\\r"; break; 4758 case '\t': ret = "\\t"; break; 4759 case 0x1b: ret = "\\e"; break; 4760 default: 4762 if (ch < 0x20) { 4763 String pre = "0"+Integer.toHexString(ch); 4764 ret = "\\x"+pre.substring(pre.length()-2, pre.length()); 4765 } else if (ch >= 0x10000) { 4766 String pre = "0"+Integer.toHexString(ch); 4767 ret = "\\v"+pre.substring(pre.length()-6, pre.length()); 4768 } else 4769 ret = ""+(char)ch; 4770 } 4771 return ret; 4772 } 4773 4774} 4775 4776 static class RegexParser { 4777 static final int T_CHAR = 0; 4778 static final int T_EOF = 1; 4779 static final int T_OR = 2; static final int T_STAR = 3; static final int T_PLUS = 4; static final int T_QUESTION = 5; static final int T_LPAREN = 6; static final int T_RPAREN = 7; static final int T_DOT = 8; static final int T_LBRACKET = 9; static final int T_BACKSOLIDUS = 10; static final int T_CARET = 11; static final int T_DOLLAR = 12; static final int T_LPAREN2 = 13; static final int T_LOOKAHEAD = 14; static final int T_NEGATIVELOOKAHEAD = 15; static final int T_LOOKBEHIND = 16; static final int T_NEGATIVELOOKBEHIND = 17; static final int T_INDEPENDENT = 18; static final int T_SET_OPERATIONS = 19; static final int T_POSIX_CHARCLASS_START = 20; static final int T_COMMENT = 21; static final int T_MODIFIERS = 22; static final int T_CONDITION = 23; static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; 4803 static class 
ReferencePosition { 4804 int refNumber; 4805 int position; 4806 ReferencePosition(int n, int pos) { 4807 this.refNumber = n; 4808 this.position = pos; 4809 } 4810 } 4811 4812 int offset; 4813 String regex; 4814 int regexlen; 4815 int options; 4816 ResourceBundle resources; 4817 int chardata; 4818 int nexttoken; 4819 static protected final int S_NORMAL = 0; 4820 static protected final int S_INBRACKETS = 1; 4821 static protected final int S_INXBRACKETS = 2; 4822 int context = S_NORMAL; 4823 int parennumber = 1; 4824 boolean hasBackReferences; 4825 Vector references = null; 4826 4827 public RegexParser() { 4828 } 4830 public RegexParser(Locale locale) { 4831 } 4833 4834 public void setLocale(Locale locale) { 4835 } 4836 4837 final ParseException ex(String key, int loc) { 4838 return new ParseException(EcorePlugin.INSTANCE.getString(key), loc); 4839 } 4840 4841 private final boolean isSet(int flag) { 4842 return (this.options & flag) == flag; 4843 } 4844 4845 synchronized Token parse(String regex, int options) throws ParseException { 4846 this.options = options; 4847 this.offset = 0; 4848 this.setContext(S_NORMAL); 4849 this.parennumber = 1; 4850 this.hasBackReferences = false; 4851 this.regex = regex; 4852 if (this.isSet(RegularExpression.EXTENDED_COMMENT)) 4853 this.regex = REUtil.stripExtendedComment(this.regex); 4854 this.regexlen = this.regex.length(); 4855 4856 4857 this.next(); 4858 Token ret = this.parseRegex(); 4859 if (this.offset != this.regexlen) 4860 throw ex("parser.parse.1", this.offset); 4861 if (this.references != null) { 4862 for (int i = 0; i < this.references.size(); i ++) { 4863 ReferencePosition position = (ReferencePosition)this.references.elementAt(i); 4864 if (this.parennumber <= position.refNumber) 4865 throw ex("parser.parse.2", position.position); 4866 } 4867 this.references.removeAllElements(); 4868 } 4869 return ret; 4870 } 4871 4872 4878 4879 protected final void setContext(int con) { 4880 this.context = con; 4881 } 4882 4883 final int 
/**
 * Reads the next lexical token from {@code regex} starting at
 * {@code offset}. The token kind is stored in {@code nexttoken} and the
 * associated character (or -1 at the end of input) in {@code chardata};
 * {@code offset} is advanced past everything that was consumed.
 *
 * Tokenization depends on {@code context}: inside brackets only the
 * backslash, '-' and '[' get special treatment, everything else is T_CHAR.
 */
final void next() {
    if (this.offset >= this.regexlen) {
        this.chardata = -1;
        this.nexttoken = T_EOF;
        return;
    }

    int ret;
    int ch = this.regex.charAt(this.offset++);
    this.chardata = ch;

    if (this.context == S_INBRACKETS) {
        switch (ch) {
          case '\\':
            // The escaped character becomes the token's chardata.
            ret = T_BACKSOLIDUS;
            if (this.offset >= this.regexlen)
                throw ex("parser.next.1", this.offset-1);
            this.chardata = this.regex.charAt(this.offset++);
            break;

          case '-':
            // "-[" is a subtraction operator only in XML Schema mode.
            if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
                && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
                this.offset++;
                ret = T_XMLSCHEMA_CC_SUBTRACTION;
            } else
                ret = T_CHAR;
            break;

          case '[':
            // "[:" starts a POSIX class name, except in XML Schema mode.
            if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
                && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
                this.offset++;
                ret = T_POSIX_CHARCLASS_START;
                break;
            }
            // Otherwise fall through: a plain '[' is an ordinary character here.
          default:
            // Combine a surrogate pair into a single code point.
            if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
                int low = this.regex.charAt(this.offset);
                if (REUtil.isLowSurrogate(low)) {
                    this.chardata = REUtil.composeFromSurrogates(ch, low);
                    this.offset ++;
                }
            }
            ret = T_CHAR;
        }
        this.nexttoken = ret;
        return;
    }

    switch (ch) {
      case '|': ret = T_OR;             break;
      case '*': ret = T_STAR;           break;
      case '+': ret = T_PLUS;           break;
      case '?': ret = T_QUESTION;       break;
      case ')': ret = T_RPAREN;         break;
      case '.': ret = T_DOT;            break;
      case '[': ret = T_LBRACKET;       break;
      case '^': ret = T_CARET;          break;
      case '$': ret = T_DOLLAR;         break;
      case '(':
        // A '(' not followed by '?' is a plain group opener.
        ret = T_LPAREN;
        if (this.offset >= this.regexlen)
            break;
        if (this.regex.charAt(this.offset) != '?')
            break;
        if (++this.offset >= this.regexlen)
            throw ex("parser.next.2", this.offset-1);
        // The character after "(?" selects the kind of special group.
        ch = this.regex.charAt(this.offset++);
        switch (ch) {
          case ':':  ret = T_LPAREN2;            break;
          case '=':  ret = T_LOOKAHEAD;          break;
          case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
          case '[':  ret = T_SET_OPERATIONS;     break;
          case '>':  ret = T_INDEPENDENT;        break;
          case '<':
            if (this.offset >= this.regexlen)
                throw ex("parser.next.2", this.offset-3);
            ch = this.regex.charAt(this.offset++);
            if (ch == '=') {
                ret = T_LOOKBEHIND;
            } else if (ch == '!') {
                ret = T_NEGATIVELOOKBEHIND;
            } else
                throw ex("parser.next.3", this.offset-3);
            break;
          case '#':
            // "(?#" comment: skip up to and including the next ')'.
            while (this.offset < this.regexlen) {
                ch = this.regex.charAt(this.offset++);
                if (ch == ')')  break;
            }
            if (ch != ')')
                throw ex("parser.next.4", this.offset-1);
            ret = T_COMMENT;
            break;
          default:
            if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {
                // Step back so the modifier letters are re-read by the
                // code that handles T_MODIFIERS.
                this.offset --;
                ret = T_MODIFIERS;
                break;
            } else if (ch == '(') {
                // "(?(" starts a conditional group.
                ret = T_CONDITION;
                break;
            }
            throw ex("parser.next.2", this.offset-2);
        }
        break;

      case '\\':
        // The escaped character becomes the token's chardata.
        ret = T_BACKSOLIDUS;
        if (this.offset >= this.regexlen)
            throw ex("parser.next.1", this.offset-1);
        this.chardata = this.regex.charAt(this.offset++);
        break;

      default:
        ret = T_CHAR;
    }
    this.nexttoken = ret;
}
T_RPAREN && ch != T_EOF) { 5045 if (concat == null) { 5046 concat = Token.createConcat(); 5047 concat.addChild(tok); 5048 tok = concat; 5049 } 5050 concat.addChild(this.parseFactor()); 5051 } 5053 return tok; 5054 } 5055 } 5056 5057 5059 Token processCaret() throws ParseException { 5060 this.next(); 5061 return Token.token_linebeginning; 5062 } 5063 Token processDollar() throws ParseException { 5064 this.next(); 5065 return Token.token_lineend; 5066 } 5067 Token processLookahead() throws ParseException { 5068 this.next(); 5069 Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex()); 5070 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 5071 this.next(); return tok; 5073 } 5074 Token processNegativelookahead() throws ParseException { 5075 this.next(); 5076 Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex()); 5077 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 5078 this.next(); return tok; 5080 } 5081 Token processLookbehind() throws ParseException { 5082 this.next(); 5083 Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex()); 5084 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 5085 this.next(); return tok; 5087 } 5088 Token processNegativelookbehind() throws ParseException { 5089 this.next(); 5090 Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex()); 5091 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 5092 this.next(); return tok; 5094 } 5095 Token processBacksolidus_A() throws ParseException { 5096 this.next(); 5097 return Token.token_stringbeginning; 5098 } 5099 Token processBacksolidus_Z() throws ParseException { 5100 this.next(); 5101 return Token.token_stringend2; 5102 } 5103 Token processBacksolidus_z() throws ParseException { 5104 this.next(); 5105 return Token.token_stringend; 5106 } 5107 Token processBacksolidus_b() throws ParseException { 5108 this.next(); 5109 return Token.token_wordedge; 5110 
/**
 * Parses a conditional group once the T_CONDITION token has been read.
 *
 * The condition is either a single digit 1-9 naming a group, or a factor
 * that must be a lookahead/lookbehind (positive or negative) or an anchor.
 * The pattern that follows may be a two-way alternation, in which case the
 * first branch is the "yes" pattern and the second the "no" pattern.
 *
 * @return the token built by {@code Token.createCondition}
 * @throws ParseException if the input ends too early ("parser.factor.4"),
 *         the condition is of an unsupported kind ("parser.factor.5"),
 *         the alternation does not have exactly two branches
 *         ("parser.factor.6"), or a closing parenthesis is missing
 *         ("parser.factor.1")
 */
Token processCondition() throws ParseException {
    if (this.offset+1 >= this.regexlen)  throw ex("parser.factor.4", this.offset);
    int refno = -1;                             // Stays -1 unless the condition is a group number.
    Token condition = null;                     // Stays null when the condition is a group number.
    int ch = this.regex.charAt(this.offset);
    if ('1' <= ch && ch <= '9') {
        refno = ch-'0';
        this.hasBackReferences = true;
        // Record the reference position for later use.
        if (this.references == null)  this.references = new Vector ();
        this.references.addElement(new ReferencePosition(refno, this.offset));
        this.offset ++;
        if (this.regex.charAt(this.offset) != ')')  throw ex("parser.factor.1", this.offset);
        this.offset ++;
    } else {
        // Step back one character when the condition starts with '?',
        // so that next() re-reads it together with the preceding '('.
        if (ch == '?')  this.offset --;
        this.next();
        condition = this.parseFactor();
        switch (condition.type) {
          case Token.LOOKAHEAD:
          case Token.NEGATIVELOOKAHEAD:
          case Token.LOOKBEHIND:
          case Token.NEGATIVELOOKBEHIND:
            break;
          case Token.ANCHOR:
            if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
            break;
          default:
            throw ex("parser.factor.5", this.offset);
        }
    }
    this.next();
    Token yesPattern = this.parseRegex();
    Token noPattern = null;
    if (yesPattern.type == Token.UNION) {
        if (yesPattern.size() != 2)  throw ex("parser.factor.6", this.offset);
        noPattern = yesPattern.getChild(1);
        yesPattern = yesPattern.getChild(0);
    }
    if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
    this.next();
    return Token.createCondition(refno, condition, yesPattern, noPattern);
}
this.regexlen) throw ex("parser.factor.2", this.offset-1); 5239 } 5240 Token tok; 5241 if (ch == ':') { 5242 this.offset ++; 5243 this.next(); 5244 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 5245 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 5246 this.next(); 5247 } else if (ch == ')') { this.offset ++; 5249 this.next(); 5250 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 5251 } else 5252 throw ex("parser.factor.3", this.offset); 5253 5254 return tok; 5255 } 5256 Token processIndependent() throws ParseException { 5257 this.next(); 5258 Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex()); 5259 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 5260 this.next(); return tok; 5262 } 5263 Token processBacksolidus_c() throws ParseException { 5264 int ch2; if (this.offset >= this.regexlen 5266 || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040) 5267 throw ex("parser.atom.1", this.offset-1); 5268 this.next(); 5269 return Token.createChar(ch2-0x40); 5270 } 5271 Token processBacksolidus_C() throws ParseException { 5272 throw ex("parser.process.1", this.offset); 5273 } 5274 Token processBacksolidus_i() throws ParseException { 5275 Token tok = Token.createChar('i'); 5276 this.next(); 5277 return tok; 5278 } 5279 Token processBacksolidus_I() throws ParseException { 5280 throw ex("parser.process.1", this.offset); 5281 } 5282 Token processBacksolidus_g() throws ParseException { 5283 this.next(); 5284 return Token.getGraphemePattern(); 5285 } 5286 Token processBacksolidus_X() throws ParseException { 5287 this.next(); 5288 return Token.getCombiningCharacterSequence(); 5289 } 5290 Token processBackreference() throws ParseException { 5291 int refnum = this.chardata-'0'; 5292 Token tok = Token.createBackReference(refnum); 5293 this.hasBackReferences = true; 5294 if (this.references == null) this.references = new Vector (); 5295 this.references.addElement(new 
ReferencePosition(refnum, this.offset-2)); 5296 this.next(); 5297 return tok; 5298 } 5299 5300 5302 5311 Token parseFactor() throws ParseException { 5312 int ch = this.read(); 5313 Token tok; 5314 switch (ch) { 5315 case T_CARET: return this.processCaret(); 5316 case T_DOLLAR: return this.processDollar(); 5317 case T_LOOKAHEAD: return this.processLookahead(); 5318 case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); 5319 case T_LOOKBEHIND: return this.processLookbehind(); 5320 case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); 5321 5322 case T_COMMENT: 5323 this.next(); 5324 return Token.createEmpty(); 5325 5326 case T_BACKSOLIDUS: 5327 switch (this.chardata) { 5328 case 'A': return this.processBacksolidus_A(); 5329 case 'Z': return this.processBacksolidus_Z(); 5330 case 'z': return this.processBacksolidus_z(); 5331 case 'b': return this.processBacksolidus_b(); 5332 case 'B': return this.processBacksolidus_B(); 5333 case '<': return this.processBacksolidus_lt(); 5334 case '>': return this.processBacksolidus_gt(); 5335 } 5336 } 5338 tok = this.parseAtom(); 5339 ch = this.read(); 5340 switch (ch) { 5341 case T_STAR: return this.processStar(tok); 5342 case T_PLUS: return this.processPlus(tok); 5343 case T_QUESTION: return this.processQuestion(tok); 5344 case T_CHAR: 5345 if (this.chardata == '{' && this.offset < this.regexlen) { 5346 5347 int off = this.offset; int min = 0, max = -1; 5349 5350 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 5351 5352 min = ch -'0'; 5353 while (off < this.regexlen 5354 && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 5355 min = min*10 +ch-'0'; 5356 if (min < 0) 5357 throw ex("parser.quantifier.5", this.offset); 5358 } 5359 } 5360 else { 5361 throw ex("parser.quantifier.1", this.offset); 5362 } 5363 5364 max = min; 5365 if (ch == ',') { 5366 5367 if (off >= this.regexlen) { 5368 throw ex("parser.quantifier.3", this.offset); 5369 } 5370 else if ((ch = this.regex.charAt(off++)) >= '0' && 
ch <= '9') { 5371 5372 max = ch -'0'; while (off < this.regexlen 5374 && (ch = this.regex.charAt(off++)) >= '0' 5375 && ch <= '9') { 5376 max = max*10 +ch-'0'; 5377 if (max < 0) 5378 throw ex("parser.quantifier.5", this.offset); 5379 } 5380 5381 if (min > max) 5382 throw ex("parser.quantifier.4", this.offset); 5383 } 5384 else { max = -1; 5386 } 5387 } 5388 5389 if (ch != '}') 5390 throw ex("parser.quantifier.2", this.offset); 5391 5392 if (this.checkQuestion(off)) { tok = Token.createNGClosure(tok); 5394 this.offset = off+1; 5395 } else { 5396 tok = Token.createClosure(tok); 5397 this.offset = off; 5398 } 5399 5400 tok.setMin(min); 5401 tok.setMax(max); 5402 this.next(); 5404 } 5405 } 5406 return tok; 5407 } 5408 5409 5415 Token parseAtom() throws ParseException { 5416 int ch = this.read(); 5417 Token tok = null; 5418 switch (ch) { 5419 case T_LPAREN: return this.processParen(); 5420 case T_LPAREN2: return this.processParen2(); case T_CONDITION: return this.processCondition(); case T_MODIFIERS: return this.processModifiers(); case T_INDEPENDENT: return this.processIndependent(); 5424 case T_DOT: 5425 this.next(); tok = Token.token_dot; 5427 break; 5428 5429 5436 case T_LBRACKET: return this.parseCharacterClass(true); 5437 case T_SET_OPERATIONS: return this.parseSetOperations(); 5438 5439 case T_BACKSOLIDUS: 5440 switch (this.chardata) { 5441 case 'd': case 'D': 5442 case 'w': case 'W': 5443 case 's': case 'S': 5444 tok = this.getTokenForShorthand(this.chardata); 5445 this.next(); 5446 return tok; 5447 5448 case 'e': case 'f': case 'n': case 'r': 5449 case 't': case 'u': case 'v': case 'x': 5450 { 5451 int ch2 = this.decodeEscaped(); 5452 if (ch2 < 0x10000) { 5453 tok = Token.createChar(ch2); 5454 } else { 5455 tok = Token.createString(REUtil.decomposeToSurrogates(ch2)); 5456 } 5457 } 5458 break; 5459 5460 case 'c': return this.processBacksolidus_c(); 5461 case 'C': return this.processBacksolidus_C(); 5462 case 'i': return this.processBacksolidus_i(); 5463 case 
/**
 * Parses a bracketed character class. On entry the opening bracket is the
 * current token; on return the token after the closing ']' is current and
 * the tokenizer context is back to S_NORMAL.
 *
 * @param useNrange when true a class starting with '^' is returned as an
 *                  NRANGE token; when false it is returned as a RANGE
 *                  token holding 0..UTF16_MAX minus the listed ranges
 * @return the sorted and compacted range token
 * @throws ParseException if the class is not terminated ("parser.cc.2"),
 *         a POSIX class name is malformed or unknown ("parser.cc.1",
 *         "parser.cc.3"), a property name is unknown ("parser.atom.5"),
 *         or an escape cannot be decoded
 */
protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
    this.setContext(S_INBRACKETS);
    this.next();
    boolean nrange = false;
    RangeToken base = null;
    RangeToken tok;
    if (this.read() == T_CHAR && this.chardata == '^') {
        nrange = true;
        this.next();
        if (useNrange) {
            tok = Token.createNRange();
        } else {
            // Collect the listed ranges separately and subtract them
            // from the full range at the end.
            base = Token.createRange();
            base.addRange(0, Token.UTF16_MAX);
            tok = Token.createRange();
        }
    } else {
        tok = Token.createRange();
    }
    int type;
    boolean firstloop = true;
    while ((type = this.read()) != T_EOF) {
        // A ']' in the very first position is a literal, not the terminator.
        if (type == T_CHAR && this.chardata == ']' && !firstloop)
            break;
        firstloop = false;
        int c = this.chardata;
        boolean end = false;                    // True when this item cannot start a "c-d" range.
        if (type == T_BACKSOLIDUS) {
            switch (c) {
              case 'd':  case 'D':
              case 'w':  case 'W':
              case 's':  case 'S':
                tok.mergeRanges(this.getTokenForShorthand(c));
                end = true;
                break;

              case 'i':  case 'I':
              case 'c':  case 'C':
                c = this.processCIinCharacterClass(tok, c);
                if (c < 0)  end = true;
                break;

              case 'p':
              case 'P':
                int pstart = this.offset;
                RangeToken tok2 = this.processBacksolidus_pP(c);
                if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
                tok.mergeRanges(tok2);
                end = true;
                break;

              default:
                c = this.decodeEscaped();
            }
        } else if (type == T_POSIX_CHARCLASS_START) {
            // The name runs from the current offset to the next ':',
            // optionally prefixed by '^' for negation.
            int nameend = this.regex.indexOf(':', this.offset);
            if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
            boolean positive = true;
            if (this.regex.charAt(this.offset) == '^') {
                this.offset ++;
                positive = false;
            }
            String name = this.regex.substring(this.offset, nameend);
            RangeToken range = Token.getRange(name, positive,
                                              this.isSet(RegularExpression.XMLSCHEMA_MODE));
            if (range == null)  throw this.ex("parser.cc.3", this.offset);
            tok.mergeRanges(range);
            end = true;
            // The ':' must be followed directly by ']'.
            if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
                throw this.ex("parser.cc.1", nameend);
            this.offset = nameend+2;
        }
        this.next();
        if (!end) {
            if (this.read() != T_CHAR || this.chardata != '-') {
                // Single character.
                tok.addRange(c, c);
            } else {
                this.next();
                if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
                if (type == T_CHAR && this.chardata == ']') {
                    // A '-' right before the closing ']' is a literal '-'.
                    tok.addRange(c, c);
                    tok.addRange('-', '-');
                } else {
                    int rangeend = this.chardata;
                    if (type == T_BACKSOLIDUS)
                        rangeend = this.decodeEscaped();
                    this.next();
                    tok.addRange(c, rangeend);
                }
            }
        }
        // With SPECIAL_COMMA set, a ',' between items is skipped.
        if (this.isSet(RegularExpression.SPECIAL_COMMA)
            && this.read() == T_CHAR && this.chardata == ',')
            this.next();
    }
    if (this.read() == T_EOF)
        throw this.ex("parser.cc.2", this.offset);
    if (!useNrange && nrange) {
        base.subtractRanges(tok);
        tok = base;
    }
    tok.sortRanges();
    tok.compactRanges();
    this.setContext(S_NORMAL);
    this.next();
    return tok;
}
this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 5686 ? Token.getRange("Nd", true) : Token.token_0to9; 5687 break; 5688 case 'D': 5689 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 5690 ? Token.getRange("Nd", false) : Token.token_not_0to9; 5691 break; 5692 case 'w': 5693 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 5694 ? Token.getRange("IsWord", true) : Token.token_wordchars; 5695 break; 5696 case 'W': 5697 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 5698 ? Token.getRange("IsWord", false) : Token.token_not_wordchars; 5699 break; 5700 case 's': 5701 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 5702 ? Token.getRange("IsSpace", true) : Token.token_spaces; 5703 break; 5704 case 'S': 5705 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 5706 ? Token.getRange("IsSpace", false) : Token.token_not_spaces; 5707 break; 5708 5709 default: 5710 throw new RuntimeException ("Internal Error: shorthands: \\u"+Integer.toString(ch, 16)); 5711 } 5712 return tok; 5713 } 5714 5715 5717 int decodeEscaped() throws ParseException { 5718 if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1); 5719 int c = this.chardata; 5720 switch (c) { 5721 case 'e': c = 0x1b; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'x': 5728 this.next(); 5729 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 5730 if (this.chardata == '{') { 5731 int v1 = 0; 5732 int uv = 0; 5733 do { 5734 this.next(); 5735 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 5736 if ((v1 = hexChar(this.chardata)) < 0) 5737 break; 5738 if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); 5739 uv = uv*16+v1; 5740 } while (true); 5741 if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1); 5742 if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); 5743 c = uv; 5744 } else { 5745 int v1 = 0; 5746 if 
(this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5747 throw ex("parser.descape.1", this.offset-1); 5748 int uv = v1; 5749 this.next(); 5750 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5751 throw ex("parser.descape.1", this.offset-1); 5752 uv = uv*16+v1; 5753 c = uv; 5754 } 5755 break; 5756 5757 case 'u': 5758 int v1 = 0; 5759 this.next(); 5760 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5761 throw ex("parser.descape.1", this.offset-1); 5762 int uv = v1; 5763 this.next(); 5764 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5765 throw ex("parser.descape.1", this.offset-1); 5766 uv = uv*16+v1; 5767 this.next(); 5768 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5769 throw ex("parser.descape.1", this.offset-1); 5770 uv = uv*16+v1; 5771 this.next(); 5772 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5773 throw ex("parser.descape.1", this.offset-1); 5774 uv = uv*16+v1; 5775 c = uv; 5776 break; 5777 5778 case 'v': 5779 this.next(); 5780 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5781 throw ex("parser.descape.1", this.offset-1); 5782 uv = v1; 5783 this.next(); 5784 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5785 throw ex("parser.descape.1", this.offset-1); 5786 uv = uv*16+v1; 5787 this.next(); 5788 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5789 throw ex("parser.descape.1", this.offset-1); 5790 uv = uv*16+v1; 5791 this.next(); 5792 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5793 throw ex("parser.descape.1", this.offset-1); 5794 uv = uv*16+v1; 5795 this.next(); 5796 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5797 throw ex("parser.descape.1", this.offset-1); 5798 uv = uv*16+v1; 5799 this.next(); 5800 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 5801 throw ex("parser.descape.1", this.offset-1); 5802 uv = uv*16+v1; 5803 if (uv > Token.UTF16_MAX) throw 
ex("parser.descappe.4", this.offset-1); 5804 c = uv; 5805 break; 5806 case 'A': 5807 case 'Z': 5808 case 'z': 5809 throw ex("parser.descape.5", this.offset-2); 5810 default: 5811 } 5812 return c; 5813 } 5814 5815 static private final int hexChar(int ch) { 5816 if (ch < '0') return -1; 5817 if (ch > 'f') return -1; 5818 if (ch <= '9') return ch-'0'; 5819 if (ch < 'A') return -1; 5820 if (ch <= 'F') return ch-'A'+10; 5821 if (ch < 'a') return -1; 5822 return ch-'a'+10; 5823 } 5824 } 5825 5826 5827 static class Token implements java.io.Serializable { 5828 static final boolean COUNTTOKENS = true; 5829 static int tokens = 0; 5830 5831 static final int CHAR = 0; static final int DOT = 11; static final int CONCAT = 1; static final int UNION = 2; static final int CLOSURE = 3; static final int RANGE = 4; static final int NRANGE = 5; static final int PAREN = 6; static final int EMPTY = 7; static final int ANCHOR = 8; static final int NONGREEDYCLOSURE = 9; static final int STRING = 10; static final int BACKREFERENCE = 12; static final int LOOKAHEAD = 20; static final int NEGATIVELOOKAHEAD = 21; static final int LOOKBEHIND = 22; static final int NEGATIVELOOKBEHIND = 23; static final int INDEPENDENT = 24; static final int MODIFIERGROUP = 25; static final int CONDITION = 26; 5852 static final int UTF16_MAX = 0x10ffff; 5853 5854 int type; 5855 5856 static Token token_dot; 5857 static Token token_0to9; 5858 static Token token_wordchars; 5859 static Token token_not_0to9; 5860 static Token token_not_wordchars; 5861 static Token token_spaces; 5862 static Token token_not_spaces; 5863 static Token token_empty; 5864 static Token token_linebeginning; 5865 static Token token_linebeginning2; 5866 static Token token_lineend; 5867 static Token token_stringbeginning; 5868 static Token token_stringend; 5869 static Token token_stringend2; 5870 static Token token_wordedge; 5871 static Token token_not_wordedge; 5872 static Token token_wordbeginning; 5873 static Token token_wordend; 5874 static { 
// Body of Token's static initializer; the opening "static {" ends the
// preceding source line. Statement order is unchanged.
            Token.token_empty = new Token(Token.EMPTY);

            Token.token_linebeginning = Token.createAnchor('^');
            Token.token_linebeginning2 = Token.createAnchor('@');
            Token.token_lineend = Token.createAnchor('$');
            Token.token_stringbeginning = Token.createAnchor('A');
            Token.token_stringend = Token.createAnchor('z');
            Token.token_stringend2 = Token.createAnchor('Z');
            Token.token_wordedge = Token.createAnchor('b');
            Token.token_not_wordedge = Token.createAnchor('B');
            Token.token_wordbeginning = Token.createAnchor('<');
            Token.token_wordend = Token.createAnchor('>');

            Token.token_dot = new Token(Token.DOT);

            Token.token_0to9 = Token.createRange();
            Token.token_0to9.addRange('0', '9');
            Token.token_wordchars = Token.createRange();
            Token.token_wordchars.addRange('0', '9');
            Token.token_wordchars.addRange('A', 'Z');
            Token.token_wordchars.addRange('_', '_');
            Token.token_wordchars.addRange('a', 'z');
            Token.token_spaces = Token.createRange();
            Token.token_spaces.addRange('\t', '\t');
            Token.token_spaces.addRange('\n', '\n');
            Token.token_spaces.addRange('\f', '\f');
            Token.token_spaces.addRange('\r', '\r');
            Token.token_spaces.addRange(' ', ' ');

            // The negated shorthands are the complements of the ranges above.
            Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
            Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
            Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
        }

        // ---- Factory methods. Each one bumps the token counter when
        // ---- COUNTTOKENS is enabled and then builds the requested node.

        /** Creates a look-around style paren token of the given type with paren number 0. */
        static Token.ParenToken createLook(int type, Token child) {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.ParenToken(type, child, 0);
        }

        /** Creates a PAREN token wrapping {@code child} with the given paren number. */
        static Token.ParenToken createParen(Token child, int pnumber) {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.ParenToken(Token.PAREN, child, pnumber);
        }

        /** Creates a CLOSURE token over {@code tok}. */
        static Token.ClosureToken createClosure(Token tok) {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.ClosureToken(Token.CLOSURE, tok);
        }

        /** Creates a NONGREEDYCLOSURE token over {@code tok}. */
        static Token.ClosureToken createNGClosure(Token tok) {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
        }

        /** Creates a two-child concatenation token. */
        static Token.ConcatToken createConcat(Token tok1, Token tok2) {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.ConcatToken(tok1, tok2);
        }

        /** Creates an empty union-style token whose type is CONCAT. */
        static Token.UnionToken createConcat() {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.UnionToken(Token.CONCAT);
        }

        /** Creates an empty UNION token. */
        static Token.UnionToken createUnion() {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.UnionToken(Token.UNION);
        }

        /** Returns the shared empty token; nothing is allocated or counted. */
        static Token createEmpty() {
            return Token.token_empty;
        }

        /** Creates an empty RANGE token. */
        static RangeToken createRange() {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new RangeToken(Token.RANGE);
        }

        /** Creates an empty NRANGE token. */
        static RangeToken createNRange() {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new RangeToken(Token.NRANGE);
        }

        /** Creates a CHAR token for {@code ch}. */
        static Token.CharToken createChar(int ch) {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.CharToken(Token.CHAR, ch);
        }

        /** Creates an ANCHOR token identified by {@code ch}. */
        static private Token.CharToken createAnchor(int ch) {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.CharToken(Token.ANCHOR, ch);
        }

        /** Creates a BACKREFERENCE token; its string is null and its number is {@code refno}. */
        static Token.StringToken createBackReference(int refno) {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.StringToken(Token.BACKREFERENCE, null, refno);
        }

        /** Creates a STRING token for {@code str} with reference number 0. */
        static Token.StringToken createString(String str) {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.StringToken(Token.STRING, str, 0);
        }

        /** Creates a modifier-group token from {@code child}, {@code add} and {@code mask}. */
        static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.ModifierToken(child, add, mask);
        }

        /** Creates a condition token from a reference number or condition and its two branches. */
        static Token.ConditionToken createCondition(int refno, Token condition,
                                                    Token yespat, Token nopat) {
            if (COUNTTOKENS) {
                Token.tokens ++;
            }
            return new Token.ConditionToken(refno, condition, yespat, nopat);
        }

5974 protected Token(int type) { 5975 this.type = type; 5976 } 5977 5978 5981 int size() { 5982 return 0; 5983 } 5984 Token getChild(int index) { 5985 return null; 5986 } 5987 void addChild(Token tok) { 5988 throw new RuntimeException ("Not supported."); 5989 } 5990 5991 protected void addRange(int start, int end) { 5993 throw new RuntimeException ("Not supported."); 5994 } 5995 protected void sortRanges() { 5996 throw new RuntimeException ("Not supported."); 5997 } 5998 protected void compactRanges() { 5999 throw new RuntimeException ("Not supported."); 6000 } 6001 protected void mergeRanges(Token tok) { 6002 throw new RuntimeException ("Not supported."); 6003 } 6004 protected void subtractRanges(Token tok) { 6005 throw new RuntimeException ("Not supported."); 6006 } 6007 protected void intersectRanges(Token tok) { 6008 throw new RuntimeException ("Not supported."); 6009 } 6010 static Token complementRanges(Token tok) { 6011 return RangeToken.complementRanges(tok); 6012 } 6013 6014 6015 void setMin(int min) { } 6017 void setMax(int max) { } 6019 int getMin() { return -1; 6021 } 6022 int getMax() { return -1; 6024 } 6025 int getReferenceNumber() { return 0; 6027 } 6028 String getString() { return null; 6030 } 6031 6032 int getParenNumber() { 6033 return 0; 6034 } 6035 int getChar() { 6036 return -1; 6037 } 6038 6039 public String toString() { 6040 return this.toString(0); 6041 } 6042 public String toString(int options) { 6043 return this.type == Token.DOT ? "." 
: ""; 6044 } 6045 6046 6049 final int getMinLength() { 6050 switch (this.type) { 6051 case CONCAT: 6052 int sum = 0; 6053 for (int i = 0; i < this.size(); i ++) 6054 sum += this.getChild(i).getMinLength(); 6055 return sum; 6056 6057 case CONDITION: 6058 case UNION: 6059 if (this.size() == 0) 6060 return 0; 6061 int ret = this.getChild(0).getMinLength(); 6062 for (int i = 1; i < this.size(); i ++) { 6063 int min = this.getChild(i).getMinLength(); 6064 if (min < ret) ret = min; 6065 } 6066 return ret; 6067 6068 case CLOSURE: 6069 case NONGREEDYCLOSURE: 6070 if (this.getMin() >= 0) 6071 return this.getMin() * this.getChild(0).getMinLength(); 6072 return 0; 6073 6074 case EMPTY: 6075 case ANCHOR: 6076 return 0; 6077 6078 case DOT: 6079 case CHAR: 6080 case RANGE: 6081 case NRANGE: 6082 return 1; 6083 6084 case INDEPENDENT: 6085 case PAREN: 6086 case MODIFIERGROUP: 6087 return this.getChild(0).getMinLength(); 6088 6089 case BACKREFERENCE: 6090 return 0; 6092 case STRING: 6093 return this.getString().length(); 6094 6095 case LOOKAHEAD: 6096 case NEGATIVELOOKAHEAD: 6097 case LOOKBEHIND: 6098 case NEGATIVELOOKBEHIND: 6099 return 0; 6101 default: 6102 throw new RuntimeException ("Token#getMinLength(): Invalid Type: "+this.type); 6103 } 6104 } 6105 6106 final int getMaxLength() { 6107 switch (this.type) { 6108 case CONCAT: 6109 int sum = 0; 6110 for (int i = 0; i < this.size(); i ++) { 6111 int d = this.getChild(i).getMaxLength(); 6112 if (d < 0) return -1; 6113 sum += d; 6114 } 6115 return sum; 6116 6117 case CONDITION: 6118 case UNION: 6119 if (this.size() == 0) 6120 return 0; 6121 int ret = this.getChild(0).getMaxLength(); 6122 for (int i = 1; ret >= 0 && i < this.size(); i ++) { 6123 int max = this.getChild(i).getMaxLength(); 6124 if (max < 0) { ret = -1; 6126 break; 6127 } 6128 if (max > ret) ret = max; 6129 } 6130 return ret; 6131 6132 case CLOSURE: 6133 case NONGREEDYCLOSURE: 6134 if (this.getMax() >= 0) 6135 return this.getMax() * this.getChild(0).getMaxLength(); 
6138 return -1; 6139 6140 case EMPTY: 6141 case ANCHOR: 6142 return 0; 6143 6144 case CHAR: 6145 return 1; 6146 case DOT: 6147 case RANGE: 6148 case NRANGE: 6149 return 2; 6150 6151 case INDEPENDENT: 6152 case PAREN: 6153 case MODIFIERGROUP: 6154 return this.getChild(0).getMaxLength(); 6155 6156 case BACKREFERENCE: 6157 return -1; 6159 case STRING: 6160 return this.getString().length(); 6161 6162 case LOOKAHEAD: 6163 case NEGATIVELOOKAHEAD: 6164 case LOOKBEHIND: 6165 case NEGATIVELOOKBEHIND: 6166 return 0; 6168 default: 6169 throw new RuntimeException ("Token#getMaxLength(): Invalid Type: "+this.type); 6170 } 6171 } 6172 6173 static final int FC_CONTINUE = 0; 6174 static final int FC_TERMINAL = 1; 6175 static final int FC_ANY = 2; 6176 private static final boolean isSet(int options, int flag) { 6177 return (options & flag) == flag; 6178 } 6179 final int analyzeFirstCharacter(RangeToken result, int options) { 6180 switch (this.type) { 6181 case CONCAT: 6182 int ret = FC_CONTINUE; 6183 for (int i = 0; i < this.size(); i ++) 6184 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) 6185 break; 6186 return ret; 6187 6188 case UNION: 6189 if (this.size() == 0) 6190 return FC_CONTINUE; 6191 6196 int ret2 = FC_CONTINUE; 6197 boolean hasEmpty = false; 6198 for (int i = 0; i < this.size(); i ++) { 6199 ret2 = this.getChild(i).analyzeFirstCharacter(result, options); 6200 if (ret2 == FC_ANY) 6201 break; 6202 else if (ret2 == FC_CONTINUE) 6203 hasEmpty = true; 6204 } 6205 return hasEmpty ? FC_CONTINUE : ret2; 6206 6207 case CONDITION: 6208 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options); 6209 if (this.size() == 1) return FC_CONTINUE; 6210 if (ret3 == FC_ANY) return ret3; 6211 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options); 6212 if (ret4 == FC_ANY) return ret4; 6213 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? 
FC_CONTINUE : FC_TERMINAL; 6214 6215 case CLOSURE: 6216 case NONGREEDYCLOSURE: 6217 this.getChild(0).analyzeFirstCharacter(result, options); 6218 return FC_CONTINUE; 6219 6220 case EMPTY: 6221 case ANCHOR: 6222 return FC_CONTINUE; 6223 6224 case CHAR: 6225 int ch = this.getChar(); 6226 result.addRange(ch, ch); 6227 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 6228 ch = Character.toUpperCase((char)ch); 6229 result.addRange(ch, ch); 6230 ch = Character.toLowerCase((char)ch); 6231 result.addRange(ch, ch); 6232 } 6233 return FC_TERMINAL; 6234 6235 case DOT: if (isSet(options, RegularExpression.SINGLE_LINE)) { 6237 return FC_CONTINUE; } else { 6239 return FC_CONTINUE; 6240 6248 } 6249 6250 case RANGE: 6251 if (isSet(options, RegularExpression.IGNORE_CASE)) { 6252 result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken()); 6253 } else { 6254 result.mergeRanges(this); 6255 } 6256 return FC_TERMINAL; 6257 6258 case NRANGE: if (isSet(options, RegularExpression.IGNORE_CASE)) { 6260 result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken())); 6261 } else { 6262 result.mergeRanges(Token.complementRanges(this)); 6263 } 6264 return FC_TERMINAL; 6265 6266 case INDEPENDENT: 6267 case PAREN: 6268 return this.getChild(0).analyzeFirstCharacter(result, options); 6269 6270 case MODIFIERGROUP: 6271 options |= ((ModifierToken)this).getOptions(); 6272 options &= ~((ModifierToken)this).getOptionsMask(); 6273 return this.getChild(0).analyzeFirstCharacter(result, options); 6274 6275 case BACKREFERENCE: 6276 result.addRange(0, UTF16_MAX); return FC_ANY; 6278 6279 case STRING: 6280 int cha = this.getString().charAt(0); 6281 int ch2; 6282 if (REUtil.isHighSurrogate(cha) 6283 && this.getString().length() >= 2 6284 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) 6285 cha = REUtil.composeFromSurrogates(cha, ch2); 6286 result.addRange(cha, cha); 6287 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 6288 
cha = Character.toUpperCase((char)cha); 6289 result.addRange(cha, cha); 6290 cha = Character.toLowerCase((char)cha); 6291 result.addRange(cha, cha); 6292 } 6293 return FC_TERMINAL; 6294 6295 case LOOKAHEAD: 6296 case NEGATIVELOOKAHEAD: 6297 case LOOKBEHIND: 6298 case NEGATIVELOOKBEHIND: 6299 return FC_CONTINUE; 6300 6301 default: 6302 throw new RuntimeException ("Token#analyzeHeadCharacter(): Invalid Type: "+this.type); 6303 } 6304 } 6305 6306 private final boolean isShorterThan(Token tok) { 6307 if (tok == null) return false; 6308 6318 int mylength; 6319 if (this.type == STRING) mylength = this.getString().length(); 6320 else throw new RuntimeException ("Internal Error: Illegal type: "+this.type); 6321 int otherlength; 6322 if (tok.type == STRING) otherlength = tok.getString().length(); 6323 else throw new RuntimeException ("Internal Error: Illegal type: "+tok.type); 6324 return mylength < otherlength; 6325 } 6326 6327 static class FixedStringContainer { 6328 Token token = null; 6329 int options = 0; 6330 FixedStringContainer() { 6331 } 6332 } 6333 6334 final void findFixedString(FixedStringContainer container, int options) { 6335 switch (this.type) { 6336 case CONCAT: 6337 Token prevToken = null; 6338 int prevOptions = 0; 6339 for (int i = 0; i < this.size(); i ++) { 6340 this.getChild(i).findFixedString(container, options); 6341 if (prevToken == null || prevToken.isShorterThan(container.token)) { 6342 prevToken = container.token; 6343 prevOptions = container.options; 6344 } 6345 } 6346 container.token = prevToken; 6347 container.options = prevOptions; 6348 return; 6349 6350 case UNION: 6351 case CLOSURE: 6352 case NONGREEDYCLOSURE: 6353 case EMPTY: 6354 case ANCHOR: 6355 case RANGE: 6356 case DOT: 6357 case NRANGE: 6358 case BACKREFERENCE: 6359 case LOOKAHEAD: 6360 case NEGATIVELOOKAHEAD: 6361 case LOOKBEHIND: 6362 case NEGATIVELOOKBEHIND: 6363 case CONDITION: 6364 container.token = null; 6365 return; 6366 6367 case CHAR: container.token = null; return; 6371 
case STRING: 6372 container.token = this; 6373 container.options = options; 6374 return; 6375 6376 case INDEPENDENT: 6377 case PAREN: 6378 this.getChild(0).findFixedString(container, options); 6379 return; 6380 6381 case MODIFIERGROUP: 6382 options |= ((ModifierToken)this).getOptions(); 6383 options &= ~((ModifierToken)this).getOptionsMask(); 6384 this.getChild(0).findFixedString(container, options); 6385 return; 6386 6387 default: 6388 throw new RuntimeException ("Token#findFixedString(): Invalid Type: "+this.type); 6389 } 6390 } 6391 6392 boolean match(int ch) { 6393 throw new RuntimeException ("NFAArrow#match(): Internal error: "+this.type); 6394 } 6395 6396 private final static Hashtable categories = new Hashtable (); 6398 private final static Hashtable categories2 = new Hashtable (); 6399 private static final String [] categoryNames = { 6400 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", 6401 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs", 6402 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", "Pi", "Pf", "L", "M", "N", "Z", "C", "P", "S", }; 6406 6407 static final int CHAR_INIT_QUOTE = 29; static final int CHAR_FINAL_QUOTE = 30; static final int CHAR_LETTER = 31; 6411 static final int CHAR_MARK = 32; 6412 static final int CHAR_NUMBER = 33; 6413 static final int CHAR_SEPARATOR = 34; 6414 static final int CHAR_OTHER = 35; 6415 static final int CHAR_PUNCTUATION = 36; 6416 static final int CHAR_SYMBOL = 37; 6417 6418 private static final String [] blockNames = { 6420 "Basic Latin", 6421 "Latin-1 Supplement", 6422 "Latin Extended-A", 6423 "Latin Extended-B", 6424 "IPA Extensions", 6425 "Spacing Modifier Letters", 6426 "Combining Diacritical Marks", 6427 "Greek", 6428 "Cyrillic", 6429 "Armenian", 6430 "Hebrew", 6431 "Arabic", 6432 "Syriac", 6433 "Thaana", 6434 "Devanagari", 6435 "Bengali", 6436 "Gurmukhi", 6437 "Gujarati", 6438 "Oriya", 6439 "Tamil", 6440 "Telugu", 6441 "Kannada", 6442 "Malayalam", 6443 "Sinhala", 6444 "Thai", 6445 
"Lao", 6446 "Tibetan", 6447 "Myanmar", 6448 "Georgian", 6449 "Hangul Jamo", 6450 "Ethiopic", 6451 "Cherokee", 6452 "Unified Canadian Aboriginal Syllabics", 6453 "Ogham", 6454 "Runic", 6455 "Khmer", 6456 "Mongolian", 6457 "Latin Extended Additional", 6458 "Greek Extended", 6459 "General Punctuation", 6460 "Superscripts and Subscripts", 6461 "Currency Symbols", 6462 "Combining Marks for Symbols", 6463 "Letterlike Symbols", 6464 "Number Forms", 6465 "Arrows", 6466 "Mathematical Operators", 6467 "Miscellaneous Technical", 6468 "Control Pictures", 6469 "Optical Character Recognition", 6470 "Enclosed Alphanumerics", 6471 "Box Drawing", 6472 "Block Elements", 6473 "Geometric Shapes", 6474 "Miscellaneous Symbols", 6475 "Dingbats", 6476 "Braille Patterns", 6477 "CJK Radicals Supplement", 6478 "Kangxi Radicals", 6479 "Ideographic Description Characters", 6480 "CJK Symbols and Punctuation", 6481 "Hiragana", 6482 "Katakana", 6483 "Bopomofo", 6484 "Hangul Compatibility Jamo", 6485 "Kanbun", 6486 "Bopomofo Extended", 6487 "Enclosed CJK Letters and Months", 6488 "CJK Compatibility", 6489 "CJK Unified Ideographs Extension A", 6490 "CJK Unified Ideographs", 6491 "Yi Syllables", 6492 "Yi Radicals", 6493 "Hangul Syllables", 6494 "Private Use", 6495 "CJK Compatibility Ideographs", 6496 "Alphabetic Presentation Forms", 6497 "Arabic Presentation Forms-A", 6498 "Combining Half Marks", 6499 "CJK Compatibility Forms", 6500 "Small Form Variants", 6501 "Arabic Presentation Forms-B", 6502 "Specials", 6503 "Halfwidth and Fullwidth Forms", 6504 "Old Italic", "Gothic", 6507 "Deseret", 6508 "Byzantine Musical Symbols", 6509 "Musical Symbols", 6510 "Mathematical Alphanumeric Symbols", 6511 "CJK Unified Ideographs Extension B", 6512 "CJK Compatibility Ideographs Supplement", 6513 "Tags", 6514 6516 }; 6517 static final String blockRanges = 6522 "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F" 6523 
+"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF" 6524 +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF" 6525 +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF" 6526 +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF" 6527 +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF" 6528 +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF" 6529 +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F" 6530 +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF" 6531 +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF" 6532 +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF"; 6533 static final int[] nonBMPBlockRanges = { 6534 0x10300, 0x1032F, 0x10330, 0x1034F, 6536 0x10400, 0x1044F, 6537 0x1D000, 0x1D0FF, 6538 0x1D100, 0x1D1FF, 6539 0x1D400, 0x1D7FF, 6540 0x20000, 0x2A6D6, 6541 0x2F800, 0x2FA1F, 6542 0xE0000, 0xE007F 6543 }; 6544 private static final int NONBMP_BLOCK_START = 84; 6545 6546 static protected RangeToken getRange(String name, boolean positive) { 6547 if (Token.categories.size() == 0) { 6548 synchronized (Token.categories) { 6549 Token[] ranges = new Token[Token.categoryNames.length]; 6550 for (int i = 0; i < ranges.length; i ++) { 6551 ranges[i] = Token.createRange(); 6552 } 6553 int type; 6554 for (int i = 0; i < 0x10000; i ++) { 6555 type = Character.getType((char)i); 6556 if (type == Character.START_PUNCTUATION || 6557 type == Character.END_PUNCTUATION) { 6558 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C || 6560 i == 0x201F || i == 0x2039) { 6561 type = CHAR_INIT_QUOTE; 6562 } 6563 if (i == 0x00BB || i == 0x2019 || 
i == 0x201D || i == 0x203A ) { 6565 type = CHAR_FINAL_QUOTE; 6566 } 6567 } 6568 ranges[type].addRange(i, i); 6569 switch (type) { 6570 case Character.UPPERCASE_LETTER: 6571 case Character.LOWERCASE_LETTER: 6572 case Character.TITLECASE_LETTER: 6573 case Character.MODIFIER_LETTER: 6574 case Character.OTHER_LETTER: 6575 type = CHAR_LETTER; 6576 break; 6577 case Character.NON_SPACING_MARK: 6578 case Character.COMBINING_SPACING_MARK: 6579 case Character.ENCLOSING_MARK: 6580 type = CHAR_MARK; 6581 break; 6582 case Character.DECIMAL_DIGIT_NUMBER: 6583 case Character.LETTER_NUMBER: 6584 case Character.OTHER_NUMBER: 6585 type = CHAR_NUMBER; 6586 break; 6587 case Character.SPACE_SEPARATOR: 6588 case Character.LINE_SEPARATOR: 6589 case Character.PARAGRAPH_SEPARATOR: 6590 type = CHAR_SEPARATOR; 6591 break; 6592 case Character.CONTROL: 6593 case Character.FORMAT: 6594 case Character.SURROGATE: 6595 case Character.PRIVATE_USE: 6596 case Character.UNASSIGNED: 6597 type = CHAR_OTHER; 6598 break; 6599 case Character.CONNECTOR_PUNCTUATION: 6600 case Character.DASH_PUNCTUATION: 6601 case Character.START_PUNCTUATION: 6602 case Character.END_PUNCTUATION: 6603 case CHAR_INIT_QUOTE: 6604 case CHAR_FINAL_QUOTE: 6605 case Character.OTHER_PUNCTUATION: 6606 type = CHAR_PUNCTUATION; 6607 break; 6608 case Character.MATH_SYMBOL: 6609 case Character.CURRENCY_SYMBOL: 6610 case Character.MODIFIER_SYMBOL: 6611 case Character.OTHER_SYMBOL: 6612 type = CHAR_SYMBOL; 6613 break; 6614 default: 6615 throw new RuntimeException ("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type); 6616 } 6617 ranges[type].addRange(i, i); 6618 } ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX); 6620 6621 for (int i = 0; i < ranges.length; i ++) { 6622 if (Token.categoryNames[i] != null) { 6623 if (i == Character.UNASSIGNED) { ranges[i].addRange(0x10000, Token.UTF16_MAX); 6625 } 6626 Token.categories.put(Token.categoryNames[i], ranges[i]); 6627 
Token.categories2.put(Token.categoryNames[i], 6628 Token.complementRanges(ranges[i])); 6629 } 6630 } 6631 StringBuffer buffer = new StringBuffer (50); 6635 for (int i = 0; i < Token.blockNames.length; i ++) { 6636 Token r1 = Token.createRange(); 6637 int location; 6638 if (i < NONBMP_BLOCK_START) { 6639 location = i*2; 6640 int rstart = Token.blockRanges.charAt(location); 6641 int rend = Token.blockRanges.charAt(location+1); 6642 r1.addRange(rstart, rend); 6646 } else { 6647 location = (i - NONBMP_BLOCK_START) * 2; 6648 r1.addRange(Token.nonBMPBlockRanges[location], 6649 Token.nonBMPBlockRanges[location + 1]); 6650 } 6651 String n = Token.blockNames[i]; 6652 if (n.equals("Specials")) 6653 r1.addRange(0xfff0, 0xfffd); 6654 if (n.equals("Private Use")) { 6655 r1.addRange(0xF0000,0xFFFFD); 6656 r1.addRange(0x100000,0x10FFFD); 6657 } 6658 Token.categories.put(n, r1); 6659 Token.categories2.put(n, Token.complementRanges(r1)); 6660 buffer.setLength(0); 6661 buffer.append("Is"); 6662 if (n.indexOf(' ') >= 0) { 6663 for (int ci = 0; ci < n.length(); ci ++) 6664 if (n.charAt(ci) != ' ') buffer.append(n.charAt(ci)); 6665 } 6666 else { 6667 buffer.append(n); 6668 } 6669 Token.setAlias(buffer.toString(), n, true); 6670 } 6671 6672 Token.setAlias("ASSIGNED", "Cn", false); 6674 Token.setAlias("UNASSIGNED", "Cn", true); 6675 Token all = Token.createRange(); 6676 all.addRange(0, Token.UTF16_MAX); 6677 Token.categories.put("ALL", all); 6678 Token.categories2.put("ALL", Token.complementRanges(all)); 6679 Token.registerNonXS("ASSIGNED"); 6680 Token.registerNonXS("UNASSIGNED"); 6681 Token.registerNonXS("ALL"); 6682 6683 Token isalpha = Token.createRange(); 6684 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); Token.categories.put("IsAlpha", isalpha); 6688 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha)); 6689 Token.registerNonXS("IsAlpha"); 6690 6691 
Token isalnum = Token.createRange(); 6692 isalnum.mergeRanges(isalpha); isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); Token.categories.put("IsAlnum", isalnum); 6695 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum)); 6696 Token.registerNonXS("IsAlnum"); 6697 6698 Token isspace = Token.createRange(); 6699 isspace.mergeRanges(Token.token_spaces); 6700 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); Token.categories.put("IsSpace", isspace); 6702 Token.categories2.put("IsSpace", Token.complementRanges(isspace)); 6703 Token.registerNonXS("IsSpace"); 6704 6705 Token isword = Token.createRange(); 6706 isword.mergeRanges(isalnum); isword.addRange('_', '_'); 6708 Token.categories.put("IsWord", isword); 6709 Token.categories2.put("IsWord", Token.complementRanges(isword)); 6710 Token.registerNonXS("IsWord"); 6711 6712 Token isascii = Token.createRange(); 6713 isascii.addRange(0, 127); 6714 Token.categories.put("IsASCII", isascii); 6715 Token.categories2.put("IsASCII", Token.complementRanges(isascii)); 6716 Token.registerNonXS("IsASCII"); 6717 6718 Token isnotgraph = Token.createRange(); 6719 isnotgraph.mergeRanges(ranges[CHAR_OTHER]); 6720 isnotgraph.addRange(' ', ' '); 6721 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph)); 6722 Token.categories2.put("IsGraph", isnotgraph); 6723 Token.registerNonXS("IsGraph"); 6724 6725 Token isxdigit = Token.createRange(); 6726 isxdigit.addRange('0', '9'); 6727 isxdigit.addRange('A', 'F'); 6728 isxdigit.addRange('a', 'f'); 6729 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit)); 6730 Token.categories2.put("IsXDigit", isxdigit); 6731 Token.registerNonXS("IsXDigit"); 6732 6733 Token.setAlias("IsDigit", "Nd", true); 6734 Token.setAlias("IsUpper", "Lu", true); 6735 Token.setAlias("IsLower", "Ll", true); 6736 Token.setAlias("IsCntrl", "C", true); 6737 Token.setAlias("IsPrint", "C", false); 6738 Token.setAlias("IsPunct", "P", true); 6739 Token.registerNonXS("IsDigit"); 6740 
Token.registerNonXS("IsUpper"); 6741 Token.registerNonXS("IsLower"); 6742 Token.registerNonXS("IsCntrl"); 6743 Token.registerNonXS("IsPrint"); 6744 Token.registerNonXS("IsPunct"); 6745 6746 Token.setAlias("alpha", "IsAlpha", true); 6747 Token.setAlias("alnum", "IsAlnum", true); 6748 Token.setAlias("ascii", "IsASCII", true); 6749 Token.setAlias("cntrl", "IsCntrl", true); 6750 Token.setAlias("digit", "IsDigit", true); 6751 Token.setAlias("graph", "IsGraph", true); 6752 Token.setAlias("lower", "IsLower", true); 6753 Token.setAlias("print", "IsPrint", true); 6754 Token.setAlias("punct", "IsPunct", true); 6755 Token.setAlias("space", "IsSpace", true); 6756 Token.setAlias("upper", "IsUpper", true); 6757 Token.setAlias("word", "IsWord", true); Token.setAlias("xdigit", "IsXDigit", true); 6759 Token.registerNonXS("alpha"); 6760 Token.registerNonXS("alnum"); 6761 Token.registerNonXS("ascii"); 6762 Token.registerNonXS("cntrl"); 6763 Token.registerNonXS("digit"); 6764 Token.registerNonXS("graph"); 6765 Token.registerNonXS("lower"); 6766 Token.registerNonXS("print"); 6767 Token.registerNonXS("punct"); 6768 Token.registerNonXS("space"); 6769 Token.registerNonXS("upper"); 6770 Token.registerNonXS("word"); 6771 Token.registerNonXS("xdigit"); 6772 } } RangeToken tok = positive ? 
(RangeToken)Token.categories.get(name) 6775 : (RangeToken)Token.categories2.get(name); 6776 return tok; 6778 } 6779 static protected RangeToken getRange(String name, boolean positive, boolean xs) { 6780 RangeToken range = Token.getRange(name, positive); 6781 if (xs && range != null && Token.isRegisterNonXS(name)) 6782 range = null; 6783 return range; 6784 } 6785 6786 static Hashtable nonxs = null; 6787 6791 static protected void registerNonXS(String name) { 6792 if (Token.nonxs == null) 6793 Token.nonxs = new Hashtable (); 6794 Token.nonxs.put(name, name); 6795 } 6796 static protected boolean isRegisterNonXS(String name) { 6797 if (Token.nonxs == null) 6798 return false; 6799 return Token.nonxs.containsKey(name); 6802 } 6803 6804 private static void setAlias(String newName, String name, boolean positive) { 6805 Token t1 = (Token)Token.categories.get(name); 6806 Token t2 = (Token)Token.categories2.get(name); 6807 if (positive) { 6808 Token.categories.put(newName, t1); 6809 Token.categories2.put(newName, t2); 6810 } else { 6811 Token.categories2.put(newName, t1); 6812 Token.categories.put(newName, t2); 6813 } 6814 } 6815 6816 6818 static final String viramaString = 6819 "\u094D" +"\u09CD" +"\u0A4D" +"\u0ACD" +"\u0B4D" +"\u0BCD" +"\u0C4D" +"\u0CCD" +"\u0D4D" +"\u0E3A" +"\u0F84"; 6831 static private Token token_grapheme = null; 6832 static synchronized Token getGraphemePattern() { 6833 if (Token.token_grapheme != null) 6834 return Token.token_grapheme; 6835 6836 Token base_char = Token.createRange(); base_char.mergeRanges(Token.getRange("ASSIGNED", true)); 6838 base_char.subtractRanges(Token.getRange("M", true)); 6839 base_char.subtractRanges(Token.getRange("C", true)); 6840 6841 Token virama = Token.createRange(); 6842 for (int i = 0; i < Token.viramaString.length(); i ++) { 6843 virama.addRange(i, i); 6844 } 6845 6846 Token combiner_wo_virama = Token.createRange(); 6847 combiner_wo_virama.mergeRanges(Token.getRange("M", true)); 6848 
combiner_wo_virama.addRange(0x1160, 0x11ff); combiner_wo_virama.addRange(0xff9e, 0xff9f); 6851 Token left = Token.createUnion(); left.addChild(base_char); 6853 left.addChild(Token.token_empty); 6854 6855 Token foo = Token.createUnion(); 6856 foo.addChild(Token.createConcat(virama, Token.getRange("L", true))); 6857 foo.addChild(combiner_wo_virama); 6858 6859 foo = Token.createClosure(foo); 6860 6861 foo = Token.createConcat(left, foo); 6862 6863 Token.token_grapheme = foo; 6864 return Token.token_grapheme; 6865 } 6866 6867 6870 static private Token token_ccs = null; 6871 static synchronized Token getCombiningCharacterSequence() { 6872 if (Token.token_ccs != null) 6873 return Token.token_ccs; 6874 6875 Token foo = Token.createClosure(Token.getRange("M", true)); foo = Token.createConcat(Token.getRange("M", false), foo); Token.token_ccs = foo; 6878 return Token.token_ccs; 6879 } 6880 6881 6883 6887 static class StringToken extends Token implements java.io.Serializable { 6888 String string; 6889 int refNumber; 6890 6891 StringToken(int type, String str, int n) { 6892 super(type); 6893 this.string = str; 6894 this.refNumber = n; 6895 } 6896 6897 int getReferenceNumber() { return this.refNumber; 6899 } 6900 String getString() { return this.string; 6902 } 6903 6904 public String toString(int options) { 6905 if (this.type == BACKREFERENCE) 6906 return "\\"+this.refNumber; 6907 else 6908 return REUtil.quoteMeta(this.string); 6909 } 6910 } 6911 6912 6915 static class ConcatToken extends Token implements java.io.Serializable { 6916 Token child; 6917 Token child2; 6918 6919 ConcatToken(Token t1, Token t2) { 6920 super(Token.CONCAT); 6921 this.child = t1; 6922 this.child2 = t2; 6923 } 6924 6925 int size() { 6926 return 2; 6927 } 6928 Token getChild(int index) { 6929 return index == 0 ? 
this.child : this.child2; 6930 } 6931 6932 public String toString(int options) { 6933 String ret; 6934 if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) { 6935 ret = this.child.toString(options)+"+"; 6936 } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) { 6937 ret = this.child.toString(options)+"+?"; 6938 } else 6939 ret = this.child.toString(options)+this.child2.toString(options); 6940 return ret; 6941 } 6942 } 6943 6944 6947 static class CharToken extends Token implements java.io.Serializable { 6948 int chardata; 6949 6950 CharToken(int type, int ch) { 6951 super(type); 6952 this.chardata = ch; 6953 } 6954 6955 int getChar() { 6956 return this.chardata; 6957 } 6958 6959 public String toString(int options) { 6960 String ret; 6961 switch (this.type) { 6962 case CHAR: 6963 switch (this.chardata) { 6964 case '|': case '*': case '+': case '?': 6965 case '(': case ')': case '.': case '[': 6966 case '{': case '\\': 6967 ret = "\\"+(char)this.chardata; 6968 break; 6969 case '\f': ret = "\\f"; break; 6970 case '\n': ret = "\\n"; break; 6971 case '\r': ret = "\\r"; break; 6972 case '\t': ret = "\\t"; break; 6973 case 0x1b: ret = "\\e"; break; 6974 default: 6976 if (this.chardata >= 0x10000) { 6977 String pre = "0"+Integer.toHexString(this.chardata); 6978 ret = "\\v"+pre.substring(pre.length()-6, pre.length()); 6979 } else 6980 ret = ""+(char)this.chardata; 6981 } 6982 break; 6983 6984 case ANCHOR: 6985 if (this == Token.token_linebeginning || this == Token.token_lineend) 6986 ret = ""+(char)this.chardata; 6987 else 6988 ret = "\\"+(char)this.chardata; 6989 break; 6990 6991 default: 6992 ret = null; 6993 } 6994 return ret; 6995 } 6996 6997 boolean match(int ch) { 6998 if (this.type == CHAR) { 6999 return ch == this.chardata; 7000 } else 7001 throw new RuntimeException ("NFAArrow#match(): Internal error: "+this.type); 7002 } 7003 } 7004 7005 7008 static class ClosureToken extends Token implements 
java.io.Serializable { 7009 int min; 7010 int max; 7011 Token child; 7012 7013 ClosureToken(int type, Token tok) { 7014 super(type); 7015 this.child = tok; 7016 this.setMin(-1); 7017 this.setMax(-1); 7018 } 7019 7020 int size() { 7021 return 1; 7022 } 7023 Token getChild(int index) { 7024 return this.child; 7025 } 7026 7027 final void setMin(int min) { 7028 this.min = min; 7029 } 7030 final void setMax(int max) { 7031 this.max = max; 7032 } 7033 final int getMin() { 7034 return this.min; 7035 } 7036 final int getMax() { 7037 return this.max; 7038 } 7039 7040 public String toString(int options) { 7041 String ret; 7042 if (this.type == CLOSURE) { 7043 if (this.getMin() < 0 && this.getMax() < 0) { 7044 ret = this.child.toString(options)+"*"; 7045 } else if (this.getMin() == this.getMax()) { 7046 ret = this.child.toString(options)+"{"+this.getMin()+"}"; 7047 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 7048 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}"; 7049 } else if (this.getMin() >= 0 && this.getMax() < 0) { 7050 ret = this.child.toString(options)+"{"+this.getMin()+",}"; 7051 } else 7052 throw new RuntimeException ("Token#toString(): CLOSURE " 7053 +this.getMin()+", "+this.getMax()); 7054 } else { 7055 if (this.getMin() < 0 && this.getMax() < 0) { 7056 ret = this.child.toString(options)+"*?"; 7057 } else if (this.getMin() == this.getMax()) { 7058 ret = this.child.toString(options)+"{"+this.getMin()+"}?"; 7059 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 7060 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?"; 7061 } else if (this.getMin() >= 0 && this.getMax() < 0) { 7062 ret = this.child.toString(options)+"{"+this.getMin()+",}?"; 7063 } else 7064 throw new RuntimeException ("Token#toString(): NONGREEDYCLOSURE " 7065 + this.getMin() + ", " + this.getMax()); 7066 } 7067 return ret; 7068 } 7069 } 7070 7071 7074 static class ParenToken extends Token implements java.io.Serializable 7075 { 7076 
Token child; 7077 7078 int parennumber; 7079 7080 ParenToken(int type, Token tok, int paren) 7081 { 7082 super(type); 7083 this.child = tok; 7084 this.parennumber = paren; 7085 } 7086 7087 int size() 7088 { 7089 return 1; 7090 } 7091 7092 Token getChild(int index) 7093 { 7094 return this.child; 7095 } 7096 7097 int getParenNumber() 7098 { 7099 return this.parennumber; 7100 } 7101 7102 public String toString(int options) 7103 { 7104 String ret = null; 7105 switch (this.type) 7106 { 7107 case PAREN: 7108 if (this.parennumber == 0) 7109 { 7110 ret = "(?:" + this.child.toString(options) + ")"; 7111 } 7112 else 7113 { 7114 ret = "(" + this.child.toString(options) + ")"; 7115 } 7116 break; 7117 7118 case LOOKAHEAD: 7119 ret = "(?=" + this.child.toString(options) + ")"; 7120 break; 7121 case NEGATIVELOOKAHEAD: 7122 ret = "(?!" + this.child.toString(options) + ")"; 7123 break; 7124 case LOOKBEHIND: 7125 ret = "(?<=" + this.child.toString(options) + ")"; 7126 break; 7127 case NEGATIVELOOKBEHIND: 7128 ret = "(?<!" + this.child.toString(options) + ")"; 7129 break; 7130 case INDEPENDENT: 7131 ret = "(?>" + this.child.toString(options) + ")"; 7132 break; 7133 } 7134 return ret; 7135 } 7136 } 7137 7138 7141 static class ConditionToken extends Token implements java.io.Serializable 7142 { 7143 int refNumber; 7144 7145 Token condition; 7146 7147 Token yes; 7148 7149 Token no; 7150 7151 ConditionToken(int refno, Token cond, Token yespat, Token nopat) 7152 { 7153 super(Token.CONDITION); 7154 this.refNumber = refno; 7155 this.condition = cond; 7156 this.yes = yespat; 7157 this.no = nopat; 7158 } 7159 7160 int size() 7161 { 7162 return this.no == null ? 
1 : 2; 7163 } 7164 7165 Token getChild(int index) 7166 { 7167 if (index == 0) 7168 return this.yes; 7169 if (index == 1) 7170 return this.no; 7171 throw new RuntimeException ("Internal Error: " + index); 7172 } 7173 7174 public String toString(int options) 7175 { 7176 String ret; 7177 if (refNumber > 0) 7178 { 7179 ret = "(?(" + refNumber + ")"; 7180 } 7181 else if (this.condition.type == Token.ANCHOR) 7182 { 7183 ret = "(?(" + this.condition + ")"; 7184 } 7185 else 7186 { 7187 ret = "(?" + this.condition; 7188 } 7189 7190 if (this.no == null) 7191 { 7192 ret += this.yes + ")"; 7193 } 7194 else 7195 { 7196 ret += this.yes + "|" + this.no + ")"; 7197 } 7198 return ret; 7199 } 7200 } 7201 7202 7205 static class ModifierToken extends Token implements java.io.Serializable 7206 { 7207 Token child; 7208 7209 int add; 7210 7211 int mask; 7212 7213 ModifierToken(Token tok, int add, int mask) 7214 { 7215 super(Token.MODIFIERGROUP); 7216 this.child = tok; 7217 this.add = add; 7218 this.mask = mask; 7219 } 7220 7221 int size() 7222 { 7223 return 1; 7224 } 7225 7226 Token getChild(int index) 7227 { 7228 return this.child; 7229 } 7230 7231 int getOptions() 7232 { 7233 return this.add; 7234 } 7235 7236 int getOptionsMask() 7237 { 7238 return this.mask; 7239 } 7240 7241 public String toString(int options) 7242 { 7243 return "(?" + (this.add == 0 ? "" : REUtil.createOptionString(this.add)) 7244 + (this.mask == 0 ? 
"" : REUtil.createOptionString(this.mask)) + ":" + this.child.toString(options) + ")"; 7245 } 7246 } 7247 7248 7252 static class UnionToken extends Token implements java.io.Serializable 7253 { 7254 Vector children; 7255 7256 UnionToken(int type) 7257 { 7258 super(type); 7259 } 7260 7261 void addChild(Token tok) 7262 { 7263 if (tok == null) 7264 return; 7265 if (this.children == null) 7266 this.children = new Vector (); 7267 if (this.type == UNION) 7268 { 7269 this.children.addElement(tok); 7270 return; 7271 } 7272 if (tok.type == CONCAT) 7274 { 7275 for (int i = 0; i < tok.size(); i++) 7276 this.addChild(tok.getChild(i)); return; 7278 } 7279 int size = this.children.size(); 7280 if (size == 0) 7281 { 7282 this.children.addElement(tok); 7283 return; 7284 } 7285 Token previous = (Token)this.children.elementAt(size - 1); 7286 if (!((previous.type == CHAR || previous.type == STRING) && (tok.type == CHAR || tok.type == STRING))) 7287 { 7288 this.children.addElement(tok); 7289 return; 7290 } 7291 7292 7294 StringBuffer buffer; 7295 int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length()); 7296 if (previous.type == CHAR) 7297 { buffer = new StringBuffer (2 + nextMaxLength); 7299 int ch = previous.getChar(); 7300 if (ch >= 0x10000) 7301 buffer.append(REUtil.decomposeToSurrogates(ch)); 7302 else 7303 buffer.append((char)ch); 7304 previous = Token.createString(null); 7305 this.children.setElementAt(previous, size - 1); 7306 } 7307 else 7308 { buffer = new StringBuffer (previous.getString().length() + nextMaxLength); 7310 buffer.append(previous.getString()); 7311 } 7312 7313 if (tok.type == CHAR) 7314 { 7315 int ch = tok.getChar(); 7316 if (ch >= 0x10000) 7317 buffer.append(REUtil.decomposeToSurrogates(ch)); 7318 else 7319 buffer.append((char)ch); 7320 } 7321 else 7322 { 7323 buffer.append(tok.getString()); 7324 } 7325 7326 ((StringToken)previous).string = new String (buffer); 7327 } 7328 7329 int size() 7330 { 7331 return this.children == null ? 
0 : this.children.size(); 7332 } 7333 7334 Token getChild(int index) 7335 { 7336 return (Token)this.children.elementAt(index); 7337 } 7338 7339 public String toString(int options) 7340 { 7341 String ret; 7342 if (this.type == CONCAT) 7343 { 7344 if (this.children.size() == 2) 7345 { 7346 Token ch = this.getChild(0); 7347 Token ch2 = this.getChild(1); 7348 if (ch2.type == CLOSURE && ch2.getChild(0) == ch) 7349 { 7350 ret = ch.toString(options) + "+"; 7351 } 7352 else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) 7353 { 7354 ret = ch.toString(options) + "+?"; 7355 } 7356 else 7357 ret = ch.toString(options) + ch2.toString(options); 7358 } 7359 else 7360 { 7361 StringBuffer sb = new StringBuffer (); 7362 for (int i = 0; i < this.children.size(); i++) 7363 { 7364 sb.append(((Token)this.children.elementAt(i)).toString(options)); 7365 } 7366 ret = new String (sb); 7367 } 7368 return ret; 7369 } 7370 if (this.children.size() == 2 && this.getChild(1).type == EMPTY) 7371 { 7372 ret = this.getChild(0).toString(options) + "?"; 7373 } 7374 else if (this.children.size() == 2 && this.getChild(0).type == EMPTY) 7375 { 7376 ret = this.getChild(1).toString(options) + "??"; 7377 } 7378 else 7379 { 7380 StringBuffer sb = new StringBuffer (); 7381 sb.append(((Token)this.children.elementAt(0)).toString(options)); 7382 for (int i = 1; i < this.children.size(); i++) 7383 { 7384 sb.append('|'); 7385 sb.append(((Token)this.children.elementAt(i)).toString(options)); 7386 } 7387 ret = new String (sb); 7388 } 7389 return ret; 7390 } 7391 } 7392 } 7393 7394 7400 static class ParserForXMLSchema extends RegexParser 7401 { 7402 7403 public ParserForXMLSchema() 7404 { 7405 } 7407 7408 public ParserForXMLSchema(Locale locale) 7409 { 7410 } 7412 7413 Token processCaret() throws ParseException 7414 { 7415 this.next(); 7416 return Token.createChar('^'); 7417 } 7418 7419 Token processDollar() throws ParseException 7420 { 7421 this.next(); 7422 return Token.createChar('$'); 7423 } 7424 7425 
Token processLookahead() throws ParseException 7426 { 7427 throw ex("parser.process.1", this.offset); 7428 } 7429 7430 Token processNegativelookahead() throws ParseException 7431 { 7432 throw ex("parser.process.1", this.offset); 7433 } 7434 7435 Token processLookbehind() throws ParseException 7436 { 7437 throw ex("parser.process.1", this.offset); 7438 } 7439 7440 Token processNegativelookbehind() throws ParseException 7441 { 7442 throw ex("parser.process.1", this.offset); 7443 } 7444 7445 Token processBacksolidus_A() throws ParseException 7446 { 7447 throw ex("parser.process.1", this.offset); 7448 } 7449 7450 Token processBacksolidus_Z() throws ParseException 7451 { 7452 throw ex("parser.process.1", this.offset); 7453 } 7454 7455 Token processBacksolidus_z() throws ParseException 7456 { 7457 throw ex("parser.process.1", this.offset); 7458 } 7459 7460 Token processBacksolidus_b() throws ParseException 7461 { 7462 throw ex("parser.process.1", this.offset); 7463 } 7464 7465 Token processBacksolidus_B() throws ParseException 7466 { 7467 throw ex("parser.process.1", this.offset); 7468 } 7469 7470 Token processBacksolidus_lt() throws ParseException 7471 { 7472 throw ex("parser.process.1", this.offset); 7473 } 7474 7475 Token processBacksolidus_gt() throws ParseException 7476 { 7477 throw ex("parser.process.1", this.offset); 7478 } 7479 7480 Token processStar(Token tok) throws ParseException 7481 { 7482 this.next(); 7483 return Token.createClosure(tok); 7484 } 7485 7486 Token processPlus(Token tok) throws ParseException 7487 { 7488 this.next(); 7490 return Token.createConcat(tok, Token.createClosure(tok)); 7491 } 7492 7493 Token processQuestion(Token tok) throws ParseException 7494 { 7495 this.next(); 7497 Token par = Token.createUnion(); 7498 par.addChild(tok); 7499 par.addChild(Token.createEmpty()); 7500 return par; 7501 } 7502 7503 boolean checkQuestion(int off) 7504 { 7505 return false; 7506 } 7507 7508 Token processParen() throws ParseException 7509 { 7510 
this.next(); 7511 Token tok = Token.createParen(this.parseRegex(), 0); 7512 if (this.read() != T_RPAREN) 7513 throw ex("parser.factor.1", this.offset - 1); 7514 this.next(); return tok; 7516 } 7517 7518 Token processParen2() throws ParseException 7519 { 7520 throw ex("parser.process.1", this.offset); 7521 } 7522 7523 Token processCondition() throws ParseException 7524 { 7525 throw ex("parser.process.1", this.offset); 7526 } 7527 7528 Token processModifiers() throws ParseException 7529 { 7530 throw ex("parser.process.1", this.offset); 7531 } 7532 7533 Token processIndependent() throws ParseException 7534 { 7535 throw ex("parser.process.1", this.offset); 7536 } 7537 7538 Token processBacksolidus_c() throws ParseException 7539 { 7540 this.next(); 7541 return this.getTokenForShorthand('c'); 7542 } 7543 7544 Token processBacksolidus_C() throws ParseException 7545 { 7546 this.next(); 7547 return this.getTokenForShorthand('C'); 7548 } 7549 7550 Token processBacksolidus_i() throws ParseException 7551 { 7552 this.next(); 7553 return this.getTokenForShorthand('i'); 7554 } 7555 7556 Token processBacksolidus_I() throws ParseException 7557 { 7558 this.next(); 7559 return this.getTokenForShorthand('I'); 7560 } 7561 7562 Token processBacksolidus_g() throws ParseException 7563 { 7564 throw this.ex("parser.process.1", this.offset - 2); 7565 } 7566 7567 Token processBacksolidus_X() throws ParseException 7568 { 7569 throw ex("parser.process.1", this.offset - 2); 7570 } 7571 7572 Token processBackreference() throws ParseException 7573 { 7574 throw ex("parser.process.1", this.offset - 4); 7575 } 7576 7577 int processCIinCharacterClass(RangeToken tok, int c) 7578 { 7579 tok.mergeRanges(this.getTokenForShorthand(c)); 7580 return -1; 7581 } 7582 7583 7600 protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException 7601 { 7602 this.setContext(S_INBRACKETS); 7603 this.next(); boolean nrange = false; 7605 RangeToken base = null; 7606 RangeToken tok; 7607 if (this.read() 
== T_CHAR && this.chardata == '^') 7608 { 7609 nrange = true; 7610 this.next(); base = Token.createRange(); 7612 base.addRange(0, Token.UTF16_MAX); 7613 tok = Token.createRange(); 7614 } 7615 else 7616 { 7617 tok = Token.createRange(); 7618 } 7619 int type; 7620 boolean firstloop = true; 7621 while ((type = this.read()) != T_EOF) 7622 { if (type == T_CHAR && this.chardata == ']' && !firstloop) 7625 { 7626 if (nrange) 7627 { 7628 base.subtractRanges(tok); 7629 tok = base; 7630 } 7631 break; 7632 } 7633 int c = this.chardata; 7634 boolean end = false; 7635 if (type == T_BACKSOLIDUS) 7636 { 7637 switch (c) 7638 { 7639 case 'd': 7640 case 'D': 7641 case 'w': 7642 case 'W': 7643 case 's': 7644 case 'S': 7645 tok.mergeRanges(this.getTokenForShorthand(c)); 7646 end = true; 7647 break; 7648 7649 case 'i': 7650 case 'I': 7651 case 'c': 7652 case 'C': 7653 c = this.processCIinCharacterClass(tok, c); 7654 if (c < 0) 7655 end = true; 7656 break; 7657 7658 case 'p': 7659 case 'P': 7660 int pstart = this.offset; 7661 RangeToken tok2 = this.processBacksolidus_pP(c); 7662 if (tok2 == null) 7663 throw this.ex("parser.atom.5", pstart); 7664 tok.mergeRanges(tok2); 7665 end = true; 7666 break; 7667 7668 default: 7669 c = this.decodeEscaped(); 7670 } } else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) 7673 { 7674 if (nrange) 7676 { 7677 base.subtractRanges(tok); 7678 tok = base; 7679 } 7680 RangeToken range2 = this.parseCharacterClass(false); 7681 tok.subtractRanges(range2); 7682 if (this.read() != T_CHAR || this.chardata != ']') 7683 throw this.ex("parser.cc.5", this.offset); 7684 break; } 7686 this.next(); 7687 if (!end) 7688 { if (type == T_CHAR) 7690 { 7691 if (c == '[') 7692 throw this.ex("parser.cc.6", this.offset - 2); 7693 if (c == ']') 7694 throw this.ex("parser.cc.7", this.offset - 2); 7695 if (c == '-') 7696 throw this.ex("parser.cc.8", this.offset - 2); 7697 } 7698 if (this.read() != T_CHAR || this.chardata != '-') 7699 { tok.addRange(c, c); 7701 } 7702 else 7703 { 
this.next(); if ((type = this.read()) == T_EOF) 7707 throw this.ex("parser.cc.2", this.offset); 7708 if ((type == T_CHAR && this.chardata == ']') || type == T_XMLSCHEMA_CC_SUBTRACTION) 7710 { 7711 throw this.ex("parser.cc.8", this.offset - 1); 7712 } 7713 else 7714 { 7715 int rangeend = this.chardata; 7716 if (type == T_CHAR) 7717 { 7718 if (rangeend == '[') 7719 throw this.ex("parser.cc.6", this.offset - 1); 7720 if (rangeend == ']') 7721 throw this.ex("parser.cc.7", this.offset - 1); 7722 if (rangeend == '-') 7723 throw this.ex("parser.cc.8", this.offset - 2); 7724 } 7725 else if (type == T_BACKSOLIDUS) 7726 rangeend = this.decodeEscaped(); 7727 this.next(); 7728 7729 if (c > rangeend) 7730 throw this.ex("parser.ope.3", this.offset - 1); 7731 tok.addRange(c, rangeend); 7732 } 7733 } 7734 } 7735 firstloop = false; 7736 } 7737 if (this.read() == T_EOF) 7738 throw this.ex("parser.cc.2", this.offset); 7739 tok.sortRanges(); 7740 tok.compactRanges(); 7741 this.setContext(S_NORMAL); 7743 this.next(); 7745 return tok; 7746 } 7747 7748 protected RangeToken parseSetOperations() throws ParseException 7749 { 7750 throw this.ex("parser.process.1", this.offset); 7751 } 7752 7753 Token getTokenForShorthand(int ch) 7754 { 7755 switch (ch) 7756 { 7757 case 'd': 7758 return ParserForXMLSchema.getRange("xml:isDigit", true); 7759 case 'D': 7760 return ParserForXMLSchema.getRange("xml:isDigit", false); 7761 case 'w': 7762 return ParserForXMLSchema.getRange("xml:isWord", true); 7763 case 'W': 7764 return ParserForXMLSchema.getRange("xml:isWord", false); 7765 case 's': 7766 return ParserForXMLSchema.getRange("xml:isSpace", true); 7767 case 'S': 7768 return ParserForXMLSchema.getRange("xml:isSpace", false); 7769 case 'c': 7770 return ParserForXMLSchema.getRange("xml:isNameChar", true); 7771 case 'C': 7772 return ParserForXMLSchema.getRange("xml:isNameChar", false); 7773 case 'i': 7774 return ParserForXMLSchema.getRange("xml:isInitialNameChar", true); 7775 case 'I': 7776 return 
ParserForXMLSchema.getRange("xml:isInitialNameChar", false); 7777 default: 7778 throw new RuntimeException ("Internal Error: shorthands: \\u" + Integer.toString(ch, 16)); 7779 } 7780 } 7781 7782 int decodeEscaped() throws ParseException 7783 { 7784 if (this.read() != T_BACKSOLIDUS) 7785 throw ex("parser.next.1", this.offset - 1); 7786 int c = this.chardata; 7787 switch (c) 7788 { 7789 case 'n': 7790 c = '\n'; 7791 break; case 'r': 7793 c = '\r'; 7794 break; case 't': 7796 c = '\t'; 7797 break; case '\\': 7799 case '|': 7800 case '.': 7801 case '^': 7802 case '-': 7803 case '?': 7804 case '*': 7805 case '+': 7806 case '{': 7807 case '}': 7808 case '(': 7809 case ')': 7810 case '[': 7811 case ']': 7812 break; default: 7814 throw ex("parser.process.1", this.offset - 2); 7815 } 7816 return c; 7817 } 7818 7819 static private Hashtable ranges = null; 7820 7821 static private Hashtable ranges2 = null; 7822 7823 static synchronized protected RangeToken getRange(String name, boolean positive) 7824 { 7825 if (ranges == null) 7826 { 7827 ranges = new Hashtable (); 7828 ranges2 = new Hashtable (); 7829 7830 Token tok = Token.createRange(); 7831 setupRange(tok, SPACES); 7832 ranges.put("xml:isSpace", tok); 7833 ranges2.put("xml:isSpace", Token.complementRanges(tok)); 7834 7835 tok = Token.createRange(); 7836 setupRange(tok, DIGITS); 7837 ranges.put("xml:isDigit", tok); 7838 ranges2.put("xml:isDigit", Token.complementRanges(tok)); 7839 7840 tok = Token.createRange(); 7841 setupRange(tok, DIGITS); 7842 ranges.put("xml:isDigit", tok); 7843 ranges2.put("xml:isDigit", Token.complementRanges(tok)); 7844 7845 tok = Token.createRange(); 7846 setupRange(tok, LETTERS); 7847 tok.mergeRanges((Token)ranges.get("xml:isDigit")); 7848 ranges.put("xml:isWord", tok); 7849 ranges2.put("xml:isWord", Token.complementRanges(tok)); 7850 7851 tok = Token.createRange(); 7852 setupRange(tok, NAMECHARS); 7853 ranges.put("xml:isNameChar", tok); 7854 ranges2.put("xml:isNameChar", 
Token.complementRanges(tok)); 7855 7856 tok = Token.createRange(); 7857 setupRange(tok, LETTERS); 7858 tok.addRange('_', '_'); 7859 tok.addRange(':', ':'); 7860 ranges.put("xml:isInitialNameChar", tok); 7861 ranges2.put("xml:isInitialNameChar", Token.complementRanges(tok)); 7862 } 7863 RangeToken tok = positive ? (RangeToken)ranges.get(name) : (RangeToken)ranges2.get(name); 7864 return tok; 7865 } 7866 7867 static void setupRange(Token range, String src) 7868 { 7869 int len = src.length(); 7870 for (int i = 0; i < len; i += 2) 7871 range.addRange(src.charAt(i), src.charAt(i + 1)); 7872 } 7873 7874 private static final String SPACES = "\t\n\r\r "; 7875 7876 private static final String NAMECHARS = "\u002d\u002e\u0030\u003a\u0041\u005a\u005f\u005f\u0061\u007a\u00b7\u00b7\u00c0\u00d6" 7877 + "\u00d8\u00f6\u00f8\u0131\u0134\u013e\u0141\u0148\u014a\u017e\u0180\u01c3\u01cd\u01f0" 7878 + "\u01f4\u01f5\u01fa\u0217\u0250\u02a8\u02bb\u02c1\u02d0\u02d1\u0300\u0345\u0360\u0361" 7879 + "\u0386\u038a\u038c\u038c\u038e\u03a1\u03a3\u03ce\u03d0\u03d6\u03da\u03da\u03dc\u03dc" 7880 + "\u03de\u03de\u03e0\u03e0\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c\u045e\u0481" 7881 + "\u0483\u0486\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9" 7882 + "\u0531\u0556\u0559\u0559\u0561\u0586\u0591\u05a1\u05a3\u05b9\u05bb\u05bd\u05bf\u05bf" 7883 + "\u05c1\u05c2\u05c4\u05c4\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0640\u0652\u0660\u0669" 7884 + "\u0670\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06d5\u06e8\u06ea\u06ed\u06f0\u06f9" 7885 + "\u0901\u0903\u0905\u0939\u093c\u094d\u0951\u0954\u0958\u0963\u0966\u096f\u0981\u0983" 7886 + "\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b2\u09b2\u09b6\u09b9\u09bc\u09bc" 7887 + "\u09be\u09c4\u09c7\u09c8\u09cb\u09cd\u09d7\u09d7\u09dc\u09dd\u09df\u09e3\u09e6\u09f1" 7888 + "\u0a02\u0a02\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36" 7889 + 
"\u0a38\u0a39\u0a3c\u0a3c\u0a3e\u0a42\u0a47\u0a48\u0a4b\u0a4d\u0a59\u0a5c\u0a5e\u0a5e" 7890 + "\u0a66\u0a74\u0a81\u0a83\u0a85\u0a8b\u0a8d\u0a8d\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0" 7891 + "\u0ab2\u0ab3\u0ab5\u0ab9\u0abc\u0ac5\u0ac7\u0ac9\u0acb\u0acd\u0ae0\u0ae0\u0ae6\u0aef" 7892 + "\u0b01\u0b03\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33\u0b36\u0b39" 7893 + "\u0b3c\u0b43\u0b47\u0b48\u0b4b\u0b4d\u0b56\u0b57\u0b5c\u0b5d\u0b5f\u0b61\u0b66\u0b6f" 7894 + "\u0b82\u0b83\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95\u0b99\u0b9a\u0b9c\u0b9c\u0b9e\u0b9f" 7895 + "\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9\u0bbe\u0bc2\u0bc6\u0bc8\u0bca\u0bcd" 7896 + "\u0bd7\u0bd7\u0be7\u0bef\u0c01\u0c03\u0c05\u0c0c\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33" 7897 + "\u0c35\u0c39\u0c3e\u0c44\u0c46\u0c48\u0c4a\u0c4d\u0c55\u0c56\u0c60\u0c61\u0c66\u0c6f" 7898 + "\u0c82\u0c83\u0c85\u0c8c\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0cbe\u0cc4" 7899 + "\u0cc6\u0cc8\u0cca\u0ccd\u0cd5\u0cd6\u0cde\u0cde\u0ce0\u0ce1\u0ce6\u0cef\u0d02\u0d03" 7900 + "\u0d05\u0d0c\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d3e\u0d43\u0d46\u0d48\u0d4a\u0d4d" 7901 + "\u0d57\u0d57\u0d60\u0d61\u0d66\u0d6f\u0e01\u0e2e\u0e30\u0e3a\u0e40\u0e4e\u0e50\u0e59" 7902 + "\u0e81\u0e82\u0e84\u0e84\u0e87\u0e88\u0e8a\u0e8a\u0e8d\u0e8d\u0e94\u0e97\u0e99\u0e9f" 7903 + "\u0ea1\u0ea3\u0ea5\u0ea5\u0ea7\u0ea7\u0eaa\u0eab\u0ead\u0eae\u0eb0\u0eb9\u0ebb\u0ebd" 7904 + "\u0ec0\u0ec4\u0ec6\u0ec6\u0ec8\u0ecd\u0ed0\u0ed9\u0f18\u0f19\u0f20\u0f29\u0f35\u0f35" 7905 + "\u0f37\u0f37\u0f39\u0f39\u0f3e\u0f47\u0f49\u0f69\u0f71\u0f84\u0f86\u0f8b\u0f90\u0f95" 7906 + "\u0f97\u0f97\u0f99\u0fad\u0fb1\u0fb7\u0fb9\u0fb9\u10a0\u10c5\u10d0\u10f6\u1100\u1100" 7907 + "\u1102\u1103\u1105\u1107\u1109\u1109\u110b\u110c\u110e\u1112\u113c\u113c\u113e\u113e" 7908 + "\u1140\u1140\u114c\u114c\u114e\u114e\u1150\u1150\u1154\u1155\u1159\u1159\u115f\u1161" 7909 + "\u1163\u1163\u1165\u1165\u1167\u1167\u1169\u1169\u116d\u116e\u1172\u1173\u1175\u1175" 7910 + 
"\u119e\u119e\u11a8\u11a8\u11ab\u11ab\u11ae\u11af\u11b7\u11b8\u11ba\u11ba\u11bc\u11c2" 7911 + "\u11eb\u11eb\u11f0\u11f0\u11f9\u11f9\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15\u1f18\u1f1d" 7912 + "\u1f20\u1f45\u1f48\u1f4d\u1f50\u1f57\u1f59\u1f59\u1f5b\u1f5b\u1f5d\u1f5d\u1f5f\u1f7d" 7913 + "\u1f80\u1fb4\u1fb6\u1fbc\u1fbe\u1fbe\u1fc2\u1fc4\u1fc6\u1fcc\u1fd0\u1fd3\u1fd6\u1fdb" 7914 + "\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc\u20d0\u20dc\u20e1\u20e1\u2126\u2126\u212a\u212b" 7915 + "\u212e\u212e\u2180\u2182\u3005\u3005\u3007\u3007\u3021\u302f\u3031\u3035\u3041\u3094" 7916 + "\u3099\u309a\u309d\u309e\u30a1\u30fa\u30fc\u30fe\u3105\u312c\u4e00\u9fa5\uac00\ud7a3" + ""; 7917 7918 private static final String LETTERS = "\u0041\u005a\u0061\u007a\u00c0\u00d6\u00d8\u00f6\u00f8\u0131\u0134\u013e\u0141\u0148" 7919 + "\u014a\u017e\u0180\u01c3\u01cd\u01f0\u01f4\u01f5\u01fa\u0217\u0250\u02a8\u02bb\u02c1" 7920 + "\u0386\u0386\u0388\u038a\u038c\u038c\u038e\u03a1\u03a3\u03ce\u03d0\u03d6\u03da\u03da" 7921 + "\u03dc\u03dc\u03de\u03de\u03e0\u03e0\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c" 7922 + "\u045e\u0481\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9" 7923 + "\u0531\u0556\u0559\u0559\u0561\u0586\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0641\u064a" 7924 + "\u0671\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06d5\u06d5\u06e5\u06e6\u0905\u0939" 7925 + "\u093d\u093d\u0958\u0961\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b2\u09b2" 7926 + "\u09b6\u09b9\u09dc\u09dd\u09df\u09e1\u09f0\u09f1\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28" 7927 + "\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36\u0a38\u0a39\u0a59\u0a5c\u0a5e\u0a5e\u0a72\u0a74" 7928 + "\u0a85\u0a8b\u0a8d\u0a8d\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0\u0ab2\u0ab3\u0ab5\u0ab9" 7929 + "\u0abd\u0abd\u0ae0\u0ae0\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33" 7930 + "\u0b36\u0b39\u0b3d\u0b3d\u0b5c\u0b5d\u0b5f\u0b61\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95" 7931 + 
"\u0b99\u0b9a\u0b9c\u0b9c\u0b9e\u0b9f\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9" 7932 + "\u0c05\u0c0c\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33\u0c35\u0c39\u0c60\u0c61\u0c85\u0c8c" 7933 + "\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0cde\u0cde\u0ce0\u0ce1\u0d05\u0d0c" 7934 + "\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d60\u0d61\u0e01\u0e2e\u0e30\u0e30\u0e32\u0e33" 7935 + "\u0e40\u0e45\u0e81\u0e82\u0e84\u0e84\u0e87\u0e88\u0e8a\u0e8a\u0e8d\u0e8d\u0e94\u0e97" 7936 + "\u0e99\u0e9f\u0ea1\u0ea3\u0ea5\u0ea5\u0ea7\u0ea7\u0eaa\u0eab\u0ead\u0eae\u0eb0\u0eb0" 7937 + "\u0eb2\u0eb3\u0ebd\u0ebd\u0ec0\u0ec4\u0f40\u0f47\u0f49\u0f69\u10a0\u10c5\u10d0\u10f6" 7938 + "\u1100\u1100\u1102\u1103\u1105\u1107\u1109\u1109\u110b\u110c\u110e\u1112\u113c\u113c" 7939 + "\u113e\u113e\u1140\u1140\u114c\u114c\u114e\u114e\u1150\u1150\u1154\u1155\u1159\u1159" 7940 + "\u115f\u1161\u1163\u1163\u1165\u1165\u1167\u1167\u1169\u1169\u116d\u116e\u1172\u1173" 7941 + "\u1175\u1175\u119e\u119e\u11a8\u11a8\u11ab\u11ab\u11ae\u11af\u11b7\u11b8\u11ba\u11ba" 7942 + "\u11bc\u11c2\u11eb\u11eb\u11f0\u11f0\u11f9\u11f9\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15" 7943 + "\u1f18\u1f1d\u1f20\u1f45\u1f48\u1f4d\u1f50\u1f57\u1f59\u1f59\u1f5b\u1f5b\u1f5d\u1f5d" 7944 + "\u1f5f\u1f7d\u1f80\u1fb4\u1fb6\u1fbc\u1fbe\u1fbe\u1fc2\u1fc4\u1fc6\u1fcc\u1fd0\u1fd3" 7945 + "\u1fd6\u1fdb\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc\u2126\u2126\u212a\u212b\u212e\u212e" 7946 + "\u2180\u2182\u3007\u3007\u3021\u3029\u3041\u3094\u30a1\u30fa\u3105\u312c\u4e00\u9fa5" + "\uac00\ud7a3"; 7947 7948 private static final String DIGITS = "\u0030\u0039\u0660\u0669\u06F0\u06F9\u0966\u096F\u09E6\u09EF\u0A66\u0A6F\u0AE6\u0AEF" 7949 + "\u0B66\u0B6F\u0BE7\u0BEF\u0C66\u0C6F\u0CE6\u0CEF\u0D66\u0D6F\u0E50\u0E59\u0ED0\u0ED9" + "\u0F20\u0F29"; 7950 } 7951 7952} | Popular Tags |