1 57 58 package org.enhydra.apache.xerces.utils.regex; 59 60 61 import java.util.Locale ; 62 import java.util.MissingResourceException ; 63 import java.util.ResourceBundle ; 64 import java.util.Vector ; 65 66 69 class RegexParser { 70 static final int T_CHAR = 0; 71 static final int T_EOF = 1; 72 static final int T_OR = 2; static final int T_STAR = 3; static final int T_PLUS = 4; static final int T_QUESTION = 5; static final int T_LPAREN = 6; static final int T_RPAREN = 7; static final int T_DOT = 8; static final int T_LBRACKET = 9; static final int T_BACKSOLIDUS = 10; static final int T_CARET = 11; static final int T_DOLLAR = 12; static final int T_LPAREN2 = 13; static final int T_LOOKAHEAD = 14; static final int T_NEGATIVELOOKAHEAD = 15; static final int T_LOOKBEHIND = 16; static final int T_NEGATIVELOOKBEHIND = 17; static final int T_INDEPENDENT = 18; static final int T_SET_OPERATIONS = 19; static final int T_POSIX_CHARCLASS_START = 20; static final int T_COMMENT = 21; static final int T_MODIFIERS = 22; static final int T_CONDITION = 23; static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; 96 static class ReferencePosition { 97 int refNumber; 98 int position; 99 ReferencePosition(int n, int pos) { 100 this.refNumber = n; 101 this.position = pos; 102 } 103 } 104 105 int offset; 106 String regex; 107 int regexlen; 108 int options; 109 ResourceBundle resources; 110 int chardata; 111 int nexttoken; 112 static protected final int S_NORMAL = 0; 113 static protected final int S_INBRACKETS = 1; 114 static protected final int S_INXBRACKETS = 2; 115 int context = S_NORMAL; 116 int parennumber = 1; 117 boolean hasBackReferences; 118 Vector references = null; 119 120 public RegexParser() { 121 this.setLocale(Locale.getDefault()); 122 } 123 public RegexParser(Locale locale) { 124 this.setLocale(locale); 125 } 126 127 public void setLocale(Locale locale) { 128 try { 129 this.resources = ResourceBundle.getBundle("org.enhydra.apache.xerces.utils.regex.message", locale); 130 } catch (MissingResourceException mre) { 131 throw new RuntimeException ("Installation Problem??? Couldn't load messages: " 132 +mre.getMessage()); 133 } 134 } 135 136 final ParseException ex(String key, int loc) { 137 return new ParseException(this.resources.getString(key), loc); 138 } 139 140 private final boolean isSet(int flag) { 141 return (this.options & flag) == flag; 142 } 143 144 synchronized Token parse(String regex, int options) throws ParseException { 145 146 this.options = options; 147 this.offset = 0; 148 this.setContext(S_NORMAL); 149 this.parennumber = 1; 150 this.hasBackReferences = false; 151 this.regex = regex; 152 if (this.isSet(RegularExpression.EXTENDED_COMMENT)) 153 this.regex = REUtil.stripExtendedComment(this.regex); 154 this.regexlen = this.regex.length(); 155 156 157 this.next(); 158 Token ret = this.parseRegex(); 159 if (this.offset != this.regexlen) 160 throw ex("parser.parse.1", this.offset); 161 if (this.references != null) { 162 for (int i = 0; i < this.references.size(); i ++) { 163 ReferencePosition position = (ReferencePosition)this.references.elementAt(i); 164 if (this.parennumber <= position.refNumber) 165 throw ex("parser.parse.2", position.position); 166 } 167 this.references.removeAllElements(); 168 } 169 return ret; 170 } 171 172 178 179 protected final void setContext(int con) { 180 this.context = con; 181 } 182 183 final int read() { 184 return this.nexttoken; 185 } 186 187 final void next() { 188 if (this.offset >= this.regexlen) { 189 this.chardata = -1; 190 this.nexttoken = T_EOF; 191 return; 192 } 193 194 int ret; 195 int ch = this.regex.charAt(this.offset++); 196 this.chardata = ch; 197 198 if (this.context == S_INBRACKETS) { 199 switch (ch) { 202 case '\\': 203 ret = T_BACKSOLIDUS; 204 if (this.offset >= this.regexlen) 205 throw ex("parser.next.1", this.offset-1); 206 this.chardata = this.regex.charAt(this.offset++); 207 break; 208 209 case '-': 210 if (this.isSet(RegularExpression.XMLSCHEMA_MODE) 211 && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') { 212 this.offset++; 213 ret = T_XMLSCHEMA_CC_SUBTRACTION; 214 } else 215 ret = T_CHAR; 216 break; 217 218 case '[': 219 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) 220 && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') { 221 this.offset++; 222 ret = T_POSIX_CHARCLASS_START; 223 break; 224 } default: 226 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) { 227 int low = this.regex.charAt(this.offset); 228 if (REUtil.isLowSurrogate(low)) { 229 this.chardata = REUtil.composeFromSurrogates(ch, low); 230 this.offset ++; 231 } 232 } 233 ret = T_CHAR; 234 } 235 this.nexttoken = ret; 236 return; 237 } 238 239 switch (ch) { 240 case '|': ret = T_OR; break; 241 case '*': ret = T_STAR; break; 242 case '+': ret = T_PLUS; break; 243 case '?': ret = T_QUESTION; break; 244 case ')': ret = T_RPAREN; break; 245 case '.': ret = T_DOT; break; 246 case '[': ret = T_LBRACKET; break; 247 case '^': ret = T_CARET; break; 248 case '$': ret = T_DOLLAR; break; 249 case '(': 250 ret = T_LPAREN; 251 if (this.offset >= this.regexlen) 252 break; 253 if (this.regex.charAt(this.offset) != '?') 254 break; 255 if (++this.offset >= this.regexlen) 256 throw ex("parser.next.2", this.offset-1); 257 ch = this.regex.charAt(this.offset++); 258 switch (ch) { 259 case ':': ret = T_LPAREN2; break; 260 case '=': ret = T_LOOKAHEAD; break; 261 case '!': ret = T_NEGATIVELOOKAHEAD; break; 262 case '[': ret = T_SET_OPERATIONS; break; 263 case '>': ret = T_INDEPENDENT; break; 264 case '<': 265 if (this.offset >= this.regexlen) 266 throw ex("parser.next.2", this.offset-3); 267 ch = this.regex.charAt(this.offset++); 268 if (ch == '=') { 269 ret = T_LOOKBEHIND; 270 } else if (ch == '!') { 271 ret = T_NEGATIVELOOKBEHIND; 272 } else 273 throw ex("parser.next.3", this.offset-3); 274 break; 275 case '#': 276 while (this.offset < this.regexlen) { 277 ch = this.regex.charAt(this.offset++); 278 if (ch == ')') break; 279 } 280 if (ch != ')') 281 throw ex("parser.next.4", this.offset-1); 282 ret = T_COMMENT; 283 break; 284 default: 285 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') { this.offset --; 287 ret = T_MODIFIERS; 288 break; 289 } else if (ch == '(') { ret = T_CONDITION; break; 292 } 293 throw ex("parser.next.2", this.offset-2); 294 } 295 break; 296 297 case '\\': 298 ret = T_BACKSOLIDUS; 299 if (this.offset >= this.regexlen) 300 throw ex("parser.next.1", this.offset-1); 301 this.chardata = this.regex.charAt(this.offset++); 302 break; 303 304 default: 305 ret = T_CHAR; 306 if (REUtil.isHighSurrogate(this.chardata) && this.offset < this.regexlen) 307 this.chardata = REUtil.composeFromSurrogates(this.chardata, 308 this.regex.charAt(this.offset++)); 309 } 310 this.nexttoken = ret; 311 } 312 313 322 Token parseRegex() throws ParseException { 323 Token tok = this.parseTerm(); 324 Token parent = null; 325 while (this.read() == T_OR) { 326 this.next(); if (parent == null) { 328 parent = Token.createUnion(); 329 parent.addChild(tok); 330 tok = parent; 331 } 332 tok.addChild(this.parseTerm()); 333 } 334 335 return tok; 336 } 337 338 341 Token parseTerm() throws ParseException { 342 int ch = this.read(); 343 if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) { 344 return Token.createEmpty(); 345 } else { 346 Token tok = this.parseFactor(); 347 Token concat = null; 348 while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) { 349 if (concat == null) { 350 concat = Token.createConcat(); 351 concat.addChild(tok); 352 tok = concat; 353 } 354 concat.addChild(this.parseFactor()); 355 } 357 return tok; 358 } 359 } 360 361 363 Token processCaret() throws ParseException { 364 this.next(); 365 return Token.token_linebeginning; 366 } 367 Token processDollar() throws ParseException { 368 this.next(); 369 return Token.token_lineend; 370 } 371 Token processLookahead() throws ParseException { 372 this.next(); 373 Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex()); 374 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 375 this.next(); return tok; 377 } 378 Token processNegativelookahead() throws ParseException { 379 this.next(); 380 Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex()); 381 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 382 this.next(); return tok; 384 } 385 Token processLookbehind() throws ParseException { 386 this.next(); 387 Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex()); 388 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 389 this.next(); return tok; 391 } 392 Token processNegativelookbehind() throws ParseException { 393 this.next(); 394 Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex()); 395 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 396 this.next(); return tok; 398 } 399 Token processBacksolidus_A() throws ParseException { 400 this.next(); 401 return Token.token_stringbeginning; 402 } 403 Token processBacksolidus_Z() throws ParseException { 404 this.next(); 405 return Token.token_stringend2; 406 } 407 Token processBacksolidus_z() throws ParseException { 408 this.next(); 409 return Token.token_stringend; 410 } 411 Token processBacksolidus_b() throws ParseException { 412 this.next(); 413 return Token.token_wordedge; 414 } 415 Token processBacksolidus_B() throws ParseException { 416 this.next(); 417 return Token.token_not_wordedge; 418 } 419 Token processBacksolidus_lt() throws ParseException { 420 this.next(); 421 return Token.token_wordbeginning; 422 } 423 Token processBacksolidus_gt() throws ParseException { 424 this.next(); 425 return Token.token_wordend; 426 } 427 Token processStar(Token tok) throws ParseException { 428 this.next(); 429 if (this.read() == T_QUESTION) { 430 this.next(); 431 return Token.createNGClosure(tok); 432 } else 433 return Token.createClosure(tok); 434 } 435 Token processPlus(Token tok) throws ParseException { 436 this.next(); 438 if (this.read() == T_QUESTION) { 439 this.next(); 440 return Token.createConcat(tok, Token.createNGClosure(tok)); 441 } else 442 return Token.createConcat(tok, Token.createClosure(tok)); 443 } 444 Token processQuestion(Token tok) throws ParseException { 445 this.next(); 447 Token par = Token.createUnion(); 448 if (this.read() == T_QUESTION) { 449 this.next(); 450 par.addChild(Token.createEmpty()); 451 par.addChild(tok); 452 } else { 453 par.addChild(tok); 454 par.addChild(Token.createEmpty()); 455 } 456 return par; 457 } 458 boolean checkQuestion(int off) { 459 return off < this.regexlen && this.regex.charAt(off) == '?'; 460 } 461 Token processParen() throws ParseException { 462 this.next(); 463 int p = this.parennumber++; 464 Token tok = Token.createParen(this.parseRegex(), p); 465 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 466 this.next(); return tok; 468 } 469 Token processParen2() throws ParseException { 470 this.next(); 471 Token tok = Token.createParen(this.parseRegex(), 0); 472 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 473 this.next(); return tok; 475 } 476 Token processCondition() throws ParseException { 477 if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset); 479 int refno = -1; 481 Token condition = null; 482 int ch = this.regex.charAt(this.offset); 483 if ('1' <= ch && ch <= '9') { 484 refno = ch-'0'; 485 this.hasBackReferences = true; 486 if (this.references == null) this.references = new Vector (); 487 this.references.addElement(new ReferencePosition(refno, this.offset)); 488 this.offset ++; 489 if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset); 490 this.offset ++; 491 } else { 492 if (ch == '?') this.offset --; this.next(); 494 condition = this.parseFactor(); 495 switch (condition.type) { 496 case Token.LOOKAHEAD: 497 case Token.NEGATIVELOOKAHEAD: 498 case Token.LOOKBEHIND: 499 case Token.NEGATIVELOOKBEHIND: 500 break; 501 case Token.ANCHOR: 502 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 503 break; 504 default: 505 throw ex("parser.factor.5", this.offset); 506 } 507 } 508 this.next(); 510 Token yesPattern = this.parseRegex(); 511 Token noPattern = null; 512 if (yesPattern.type == Token.UNION) { 513 if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset); 514 noPattern = yesPattern.getChild(1); 515 yesPattern = yesPattern.getChild(0); 516 } 517 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 518 this.next(); 519 return Token.createCondition(refno, condition, yesPattern, noPattern); 520 } 521 Token processModifiers() throws ParseException { 522 int add = 0, mask = 0, ch = -1; 525 while (this.offset < this.regexlen) { 526 ch = this.regex.charAt(this.offset); 527 int v = REUtil.getOptionValue(ch); 528 if (v == 0) break; add |= v; 530 this.offset ++; 531 } 532 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); 533 if (ch == '-') { 534 this.offset ++; 535 while (this.offset < this.regexlen) { 536 ch = this.regex.charAt(this.offset); 537 int v = REUtil.getOptionValue(ch); 538 if (v == 0) break; mask |= v; 540 this.offset ++; 541 } 542 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); 543 } 544 Token tok; 545 if (ch == ':') { 546 this.offset ++; 547 this.next(); 548 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 549 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 550 this.next(); 551 } else if (ch == ')') { this.offset ++; 553 this.next(); 554 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 555 } else 556 throw ex("parser.factor.3", this.offset); 557 558 return tok; 559 } 560 Token processIndependent() throws ParseException { 561 this.next(); 562 Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex()); 563 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 564 this.next(); return tok; 566 } 567 Token processBacksolidus_c() throws ParseException { 568 int ch2; if (this.offset >= this.regexlen 570 || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040) 571 throw ex("parser.atom.1", this.offset-1); 572 this.next(); 573 return Token.createChar(ch2-0x40); 574 } 575 Token processBacksolidus_C() throws ParseException { 576 throw ex("parser.process.1", this.offset); 577 } 578 Token processBacksolidus_i() throws ParseException { 579 Token tok = Token.createChar('i'); 580 this.next(); 581 return tok; 582 } 583 Token processBacksolidus_I() throws ParseException { 584 throw ex("parser.process.1", this.offset); 585 } 586 Token processBacksolidus_g() throws ParseException { 587 this.next(); 588 return Token.getGraphemePattern(); 589 } 590 Token processBacksolidus_X() throws ParseException { 591 this.next(); 592 return Token.getCombiningCharacterSequence(); 593 } 594 Token processBackreference() throws ParseException { 595 int refnum = this.chardata-'0'; 596 Token tok = Token.createBackReference(refnum); 597 this.hasBackReferences = true; 598 if (this.references == null) this.references = new Vector (); 599 this.references.addElement(new ReferencePosition(refnum, this.offset-2)); 600 this.next(); 601 return tok; 602 } 603 604 606 615 Token parseFactor() throws ParseException { 616 int ch = this.read(); 617 Token tok; 618 switch (ch) { 619 case T_CARET: return this.processCaret(); 620 case T_DOLLAR: return this.processDollar(); 621 case T_LOOKAHEAD: return this.processLookahead(); 622 case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); 623 case T_LOOKBEHIND: return this.processLookbehind(); 624 case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); 625 626 case T_COMMENT: 627 this.next(); 628 return Token.createEmpty(); 629 630 case T_BACKSOLIDUS: 631 switch (this.chardata) { 632 case 'A': return this.processBacksolidus_A(); 633 case 'Z': return this.processBacksolidus_Z(); 634 case 'z': return this.processBacksolidus_z(); 635 case 'b': return this.processBacksolidus_b(); 636 case 'B': return this.processBacksolidus_B(); 637 case '<': return this.processBacksolidus_lt(); 638 case '>': return this.processBacksolidus_gt(); 639 } 640 } 642 tok = this.parseAtom(); 643 ch = this.read(); 644 switch (ch) { 645 case T_STAR: return this.processStar(tok); 646 case T_PLUS: return this.processPlus(tok); 647 case T_QUESTION: return this.processQuestion(tok); 648 case T_CHAR: 649 if (this.chardata == '{') { 650 int off = this.offset; 652 int min = 0, max = -1; 653 if (off >= this.regexlen) break; 654 ch = this.regex.charAt(off++); 655 if (ch < '0' || ch > '9') { 656 throw new RuntimeException ("Invalid quantifier '"+(char)ch+"' in " + regex); 657 } 658 min = ch-'0'; 659 while (off < this.regexlen 660 && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 661 min = min*10 +ch-'0'; 662 ch = -1; 663 } 664 max = min; 665 if (ch!='}' && ch !=',' && (ch < '0' || ch > '9')) { 666 throw new RuntimeException ("Invalid quantifier '"+(char)ch+"' in " + regex); 667 } 668 671 else if (ch == ',') { 672 if (ch == '}') { 673 max = -1; } else { 675 max = ch-'0'; while (off < this.regexlen 677 && (ch = this.regex.charAt(off++)) >= '0' 678 && ch <= '9') { 679 max = max*10 +ch-'0'; 680 ch = -1; 681 } 682 685 if (ch !='}' && (ch < '0' || ch > '9')) { 686 throw new RuntimeException ( "Invalid quantifier '"+(char)ch+"' in" + regex); 687 } 688 } 689 } 690 if (this.checkQuestion(off)) { 692 tok = Token.createNGClosure(tok); 693 this.offset = off+1; 694 } else { 695 tok = Token.createClosure(tok); 696 this.offset = off; 697 } 698 tok.setMin(min); 699 tok.setMax(max); 700 this.next(); 702 } 703 } 704 return tok; 705 } 706 707 713 Token parseAtom() throws ParseException { 714 int ch = this.read(); 715 Token tok = null; 716 switch (ch) { 717 case T_LPAREN: return this.processParen(); 718 case T_LPAREN2: return this.processParen2(); case T_CONDITION: return this.processCondition(); case T_MODIFIERS: return this.processModifiers(); case T_INDEPENDENT: return this.processIndependent(); 722 case T_DOT: 723 this.next(); tok = Token.token_dot; 725 break; 726 727 734 case T_LBRACKET: return this.parseCharacterClass(true); 735 case T_SET_OPERATIONS: return this.parseSetOperations(); 736 737 case T_BACKSOLIDUS: 738 switch (this.chardata) { 739 case 'd': case 'D': 740 case 'w': case 'W': 741 case 's': case 'S': 742 tok = this.getTokenForShorthand(this.chardata); 743 this.next(); 744 return tok; 745 746 case 'e': case 'f': case 'n': case 'r': 747 case 't': case 'u': case 'v': case 'x': 748 { 749 int ch2 = this.decodeEscaped(); 750 if (ch2 < 0x10000) { 751 tok = Token.createChar(ch2); 752 } else { 753 tok = Token.createString(REUtil.decomposeToSurrogates(ch2)); 754 } 755 } 756 break; 757 758 case 'c': return this.processBacksolidus_c(); 759 case 'C': return this.processBacksolidus_C(); 760 case 'i': return this.processBacksolidus_i(); 761 case 'I': return this.processBacksolidus_I(); 762 case 'g': return this.processBacksolidus_g(); 763 case 'X': return this.processBacksolidus_X(); 764 case '1': case '2': case '3': case '4': 765 case '5': case '6': case '7': case '8': case '9': 766 return this.processBackreference(); 767 768 case 'P': 769 case 'p': 770 int pstart = this.offset; 771 tok = processBacksolidus_pP(this.chardata); 772 if (tok == null) throw this.ex("parser.atom.5", pstart); 773 break; 774 775 default: 776 tok = Token.createChar(this.chardata); 777 } 778 this.next(); 779 break; 780 781 case T_CHAR: 782 tok = Token.createChar(this.chardata); 783 this.next(); 784 break; 785 786 default: 787 throw this.ex("parser.atom.4", this.offset-1); 788 } 789 return tok; 790 } 791 792 protected RangeToken processBacksolidus_pP(int c) throws ParseException { 793 boolean positive = c == 'p'; 794 this.next(); 795 if (this.read() != T_CHAR) throw this.ex("parser.atom.2", this.offset-1); 796 RangeToken tok; 797 switch (this.chardata) { 798 case 'L': tok = Token.getRange("L", positive); break; 800 case 'M': tok = Token.getRange("M", positive); break; 802 case 'N': tok = Token.getRange("N", positive); break; 804 case 'Z': tok = Token.getRange("Z", positive); break; 806 case 'C': tok = Token.getRange("C", positive); break; 808 case 'P': tok = Token.getRange("P", positive); break; 810 case 'S': tok = Token.getRange("S", positive); break; 812 case '{': 813 int namestart = this.offset; 816 int nameend = this.regex.indexOf('}', namestart); 817 if (nameend < 0) throw this.ex("parser.atom.3", this.offset); 818 this.offset = nameend+1; 819 tok = Token.getRange(this.regex.substring(namestart, nameend), positive); 820 824 break; 825 826 default: 827 throw this.ex("parser.atom.2", this.offset-1); 828 } 829 return tok; 830 } 831 832 int processCIinCharacterClass(RangeToken tok, int c) { 833 return this.decodeEscaped(); 834 } 835 836 843 protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException { 844 this.setContext(S_INBRACKETS); 845 this.next(); boolean nrange = false; 847 RangeToken base = null; 848 RangeToken tok; 849 if (this.read() == T_CHAR && this.chardata == '^') { 850 nrange = true; 851 this.next(); if (useNrange) { 853 tok = Token.createNRange(); 854 } else { 855 base = Token.createRange(); 856 base.addRange(0, Token.UTF16_MAX); 857 tok = Token.createRange(); 858 } 859 } else { 860 tok = Token.createRange(); 861 } 862 int type; 863 boolean firstloop = true; 864 while ((type = this.read()) != T_EOF) { 865 if (type == T_CHAR && this.chardata == ']' && !firstloop) 866 break; 867 firstloop = false; 868 int c = this.chardata; 869 boolean end = false; 870 if (type == T_BACKSOLIDUS) { 871 switch (c) { 872 case 'd': case 'D': 873 case 'w': case 'W': 874 case 's': case 'S': 875 tok.mergeRanges(this.getTokenForShorthand(c)); 876 end = true; 877 break; 878 879 case 'i': case 'I': 880 case 'c': case 'C': 881 c = this.processCIinCharacterClass(tok, c); 882 if (c < 0) end = true; 883 break; 884 885 case 'p': 886 case 'P': 887 int pstart = this.offset; 888 RangeToken tok2 = this.processBacksolidus_pP(c); 889 if (tok2 == null) throw this.ex("parser.atom.5", pstart); 890 tok.mergeRanges(tok2); 891 end = true; 892 break; 893 894 default: 895 c = this.decodeEscaped(); 896 } } else if (type == T_POSIX_CHARCLASS_START) { 900 int nameend = this.regex.indexOf(':', this.offset); 901 if (nameend < 0) throw this.ex("parser.cc.1", this.offset); 902 boolean positive = true; 903 if (this.regex.charAt(this.offset) == '^') { 904 this.offset ++; 905 positive = false; 906 } 907 String name = this.regex.substring(this.offset, nameend); 908 RangeToken range = Token.getRange(name, positive); 909 if (range == null) throw this.ex("parser.cc.3", this.offset); 910 tok.mergeRanges(range); 911 end = true; 912 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']') 913 throw this.ex("parser.cc.1", nameend); 914 this.offset = nameend+2; 915 } 916 this.next(); 917 if (!end) { if (this.read() != T_CHAR || this.chardata != '-') { tok.addRange(c, c); 920 } else { 921 this.next(); if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset); 923 if (type == T_CHAR && this.chardata == ']') { 924 tok.addRange(c, c); 925 tok.addRange('-', '-'); 926 } else { 927 int rangeend = this.chardata; 928 if (type == T_BACKSOLIDUS) 929 rangeend = this.decodeEscaped(); 930 this.next(); 931 tok.addRange(c, rangeend); 932 } 933 } 934 } 935 if (this.isSet(RegularExpression.SPECIAL_COMMA) 936 && this.read() == T_CHAR && this.chardata == ',') 937 this.next(); 938 } 939 if (this.read() == T_EOF) 940 throw this.ex("parser.cc.2", this.offset); 941 if (!useNrange && nrange) { 942 base.subtractRanges(tok); 943 tok = base; 944 } 945 tok.sortRanges(); 946 tok.compactRanges(); 947 952 this.setContext(S_NORMAL); 953 this.next(); 955 return tok; 956 } 957 private RangeToken parseCharacterClass_old(boolean useNrange) throws ParseException { 958 this.setContext(S_INBRACKETS); 959 this.next(); boolean nrange = false; 961 RangeToken base = null; 962 RangeToken tok; 963 if (this.read() == T_CHAR && this.chardata == '^') { 964 nrange = true; 965 this.next(); if (useNrange) { 967 tok = Token.createNRange(); 968 } else { 969 base = Token.createRange(); 970 base.addRange(0, Token.UTF16_MAX); 971 tok = Token.createRange(); 972 } 973 } else { 974 tok = Token.createRange(); 975 } 976 int type; 977 while ((type = this.read()) != T_EOF 978 && !(type == T_CHAR && this.chardata == ']')) { 979 int c = this.chardata; 980 996 boolean end = false; 997 if (type == T_BACKSOLIDUS) { 998 switch (c) { 999 case 'd': case 'D': 1000 case 'w': case 'W': 1001 case 's': case 'S': 1002 tok.mergeRanges(this.getTokenForShorthand(c)); 1003 end = true; 1004 break; 1005 1006 case 'i': case 'I': 1007 case 'c': case 'C': 1008 c = this.processCIinCharacterClass(tok, c); 1009 if (c < 0) end = true; 1010 break; 1011 1012 case 'p': 1013 case 'P': 1014 boolean positive = c == 'p'; 1015 int pstart = this.offset; 1016 this.next(); 1017 if (this.read() != T_CHAR) throw ex("parser.atom.2", this.offset-1); 1018 RangeToken tok2 = null; 1019 switch (this.chardata) { 1020 case 'L': tok2 = Token.getRange("L", positive); break; 1022 case 'M': tok2 = Token.getRange("M", positive); break; 1024 case 'N': tok2 = Token.getRange("N", positive); break; 1026 case 'Z': tok2 = Token.getRange("Z", positive); break; 1028 case 'C': tok2 = Token.getRange("C", positive); break; 1030 case 'P': tok2 = Token.getRange("P", positive); break; 1032 case 'S': tok2 = Token.getRange("S", positive); break; 1034 case '{': 1035 pstart = this.offset; 1037 int namestart = this.offset; 1038 int nameend = this.regex.indexOf('}', namestart); 1039 if (nameend < 0) throw ex("parser.atom.3", this.offset); 1040 this.offset = nameend+1; 1041 tok2 = Token.getRange(this.regex.substring(namestart, nameend), positive); 1042 break; 1043 1044 default: 1045 throw ex("parser.atom.2", this.offset-1); 1046 } 1047 if (tok2 == null) throw ex("parser.atom.5", pstart); 1048 tok.mergeRanges(tok2); 1049 end = true; 1050 break; 1051 1052 default: 1053 c = this.decodeEscaped(); 1054 } } else if (type == T_POSIX_CHARCLASS_START) { 1058 int nameend = this.regex.indexOf(':', this.offset); 1059 if (nameend < 0) throw ex("parser.cc.1", this.offset); 1060 String name = this.regex.substring(this.offset, nameend); 1061 RangeToken range = Token.getRange(name, true); 1062 if (range == null) throw ex("parser.cc.3", this.offset); 1063 tok.mergeRanges(range); 1064 end = true; 1065 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']') 1066 throw ex("parser.cc.1", nameend); 1067 this.offset = nameend+2; 1068 } 1069 this.next(); 1070 if (!end) { 1071 if (this.read() != T_CHAR || this.chardata != '-') { tok.addRange(c, c); 1073 } else { 1074 this.next(); if ((type = this.read()) == T_EOF) throw ex("parser.cc.2", this.offset); 1076 int rangeend = this.chardata; 1077 if (type == T_BACKSOLIDUS) 1078 rangeend = this.decodeEscaped(); 1079 this.next(); 1080 tok.addRange(c, rangeend); 1081 } 1082 } 1083 if (this.read() == T_CHAR && this.chardata == ',') 1084 this.next(); 1085 } 1086 if (this.read() == T_EOF) 1087 throw ex("parser.cc.2", this.offset); 1088 if (!useNrange && nrange) { 1089 base.subtractRanges(tok); 1090 tok = base; 1091 } 1092 tok.sortRanges(); 1093 tok.compactRanges(); 1094 1099 this.setContext(S_NORMAL); 1100 this.next(); 1102 return tok; 1103 } 1104 1105 1108 protected RangeToken parseSetOperations() throws ParseException { 1109 RangeToken tok = this.parseCharacterClass(false); 1110 int type; 1111 while ((type = this.read()) != T_RPAREN) { 1112 int ch = this.chardata; 1113 if (type == T_CHAR && (ch == '-' || ch == '&') 1114 || type == T_PLUS) { 1115 this.next(); 1116 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1); 1117 RangeToken t2 = this.parseCharacterClass(false); 1118 if (type == T_PLUS) 1119 tok.mergeRanges(t2); 1120 else if (ch == '-') 1121 tok.subtractRanges(t2); 1122 else if (ch == '&') 1123 tok.intersectRanges(t2); 1124 else 1125 throw new RuntimeException ("ASSERT"); 1126 } else { 1127 throw ex("parser.ope.2", this.offset-1); 1128 } 1129 } 1130 this.next(); 1131 return tok; 1132 } 1133 1134 Token getTokenForShorthand(int ch) { 1135 Token tok; 1136 switch (ch) { 1137 case 'd': 1138 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1139 ? Token.getRange("Nd", true) : Token.token_0to9; 1140 break; 1141 case 'D': 1142 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1143 ? Token.getRange("Nd", false) : Token.token_not_0to9; 1144 break; 1145 case 'w': 1146 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1147 ? Token.getRange("IsWord", true) : Token.token_wordchars; 1148 break; 1149 case 'W': 1150 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1151 ? Token.getRange("IsWord", false) : Token.token_not_wordchars; 1152 break; 1153 case 's': 1154 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1155 ? Token.getRange("IsSpace", true) : Token.token_spaces; 1156 break; 1157 case 'S': 1158 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1159 ? Token.getRange("IsSpace", false) : Token.token_not_spaces; 1160 break; 1161 1162 default: 1163 throw new RuntimeException ("Internal Error: shorthands: \\u"+Integer.toString(ch, 16)); 1164 } 1165 return tok; 1166 } 1167 1168 1170 int decodeEscaped() throws ParseException { 1171 if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1); 1172 int c = this.chardata; 1173 switch (c) { 1174 case 'e': c = 0x1b; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'x': 1181 this.next(); 1182 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 1183 if (this.chardata == '{') { 1184 int v1 = 0; 1185 int uv = 0; 1186 do { 1187 this.next(); 1188 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 1189 if ((v1 = hexChar(this.chardata)) < 0) 1190 break; 1191 if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); 1192 uv = uv*16+v1; 1193 } while (true); 1194 if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1); 1195 if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); 1196 c = uv; 1197 } else { 1198 int v1 = 0; 1199 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1200 throw ex("parser.descape.1", this.offset-1); 1201 int uv = v1; 1202 this.next(); 1203 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1204 throw ex("parser.descape.1", this.offset-1); 1205 uv = uv*16+v1; 1206 c = uv; 1207 } 1208 break; 1209 1210 case 'u': 1211 int v1 = 0; 1212 this.next(); 1213 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1214 throw ex("parser.descape.1", this.offset-1); 1215 int uv = v1; 1216 this.next(); 1217 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1218 throw ex("parser.descape.1", this.offset-1); 1219 uv = uv*16+v1; 1220 this.next(); 1221 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1222 throw ex("parser.descape.1", this.offset-1); 1223 uv = uv*16+v1; 1224 this.next(); 1225 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1226 throw ex("parser.descape.1", this.offset-1); 1227 uv = uv*16+v1; 1228 c = uv; 1229 break; 1230 1231 case 'v': 1232 this.next(); 1233 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1234 throw ex("parser.descape.1", this.offset-1); 1235 uv = v1; 1236 this.next(); 1237 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1238 throw ex("parser.descape.1", this.offset-1); 1239 uv = uv*16+v1; 1240 this.next(); 1241 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1242 throw ex("parser.descape.1", this.offset-1); 1243 uv = uv*16+v1; 1244 this.next(); 1245 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1246 throw ex("parser.descape.1", this.offset-1); 1247 uv = uv*16+v1; 1248 this.next(); 1249 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1250 throw ex("parser.descape.1", this.offset-1); 1251 uv = uv*16+v1; 1252 this.next(); 1253 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1254 throw ex("parser.descape.1", this.offset-1); 1255 uv = uv*16+v1; 1256 if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1); 1257 c = uv; 1258 break; 1259 case 'A': 1260 case 'Z': 1261 case 'z': 1262 throw ex("parser.descape.5", this.offset-2); 1263 default: 1264 } 1265 return c; 1266 } 1267 1268 static private final int hexChar(int ch) { 1269 if (ch < '0') return -1; 1270 if (ch > 'f') return -1; 1271 if (ch <= '9') return ch-'0'; 1272 if (ch < 'A') return -1; 1273 if (ch <= 'F') return ch-'A'+10; 1274 if (ch < 'a') return -1; 1275 return ch-'a'+10; 1276 } 1277} 1278 | Popular Tags |