KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > python > modules > sre > SRE_STATE


1 /*
2  * Copyright 2000 Finn Bock
3  *
4  * This program contains material copyrighted by:
5  * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
6  *
7  * This version of the SRE library can be redistributed under CNRI's
8  * Python 1.6 license. For any other use, please contact Secret Labs
9  * AB (info@pythonware.com).
10  *
11  * Portions of this engine have been developed in cooperation with
12  * CNRI. Hewlett-Packard provided funding for 1.6 integration and
13  * other compatibility work.
14  */

15
16 // Last updated to _sre.c: 2.52
17

18 package org.python.modules.sre;
19
20 import java.util.*;
21
22 public class SRE_STATE {
23     /* illegal opcode */
24     public static final int SRE_ERROR_ILLEGAL = -1;
25
26     /* illegal state */
27     public static final int SRE_ERROR_STATE = -2;
28
29     /* runaway recursion */
30     public static final int SRE_ERROR_RECURSION_LIMIT = -3;
31
32     public static final int SRE_OP_FAILURE = 0;
33     public static final int SRE_OP_SUCCESS = 1;
34     public static final int SRE_OP_ANY = 2;
35     public static final int SRE_OP_ANY_ALL = 3;
36     public static final int SRE_OP_ASSERT = 4;
37     public static final int SRE_OP_ASSERT_NOT = 5;
38     public static final int SRE_OP_AT = 6;
39     public static final int SRE_OP_BRANCH = 7;
40     public static final int SRE_OP_CALL = 8;
41     public static final int SRE_OP_CATEGORY = 9;
42     public static final int SRE_OP_CHARSET = 10;
43     public static final int SRE_OP_BIGCHARSET = 11;
44     public static final int SRE_OP_GROUPREF = 12;
45     public static final int SRE_OP_GROUPREF_IGNORE = 13;
46     public static final int SRE_OP_IN = 14;
47     public static final int SRE_OP_IN_IGNORE = 15;
48     public static final int SRE_OP_INFO = 16;
49     public static final int SRE_OP_JUMP = 17;
50     public static final int SRE_OP_LITERAL = 18;
51     public static final int SRE_OP_LITERAL_IGNORE = 19;
52     public static final int SRE_OP_MARK = 20;
53     public static final int SRE_OP_MAX_UNTIL = 21;
54     public static final int SRE_OP_MIN_UNTIL = 22;
55     public static final int SRE_OP_NOT_LITERAL = 23;
56     public static final int SRE_OP_NOT_LITERAL_IGNORE = 24;
57     public static final int SRE_OP_NEGATE = 25;
58     public static final int SRE_OP_RANGE = 26;
59     public static final int SRE_OP_REPEAT = 27;
60     public static final int SRE_OP_REPEAT_ONE = 28;
61     public static final int SRE_OP_SUBPATTERN = 29;
62
63     public static final int SRE_AT_BEGINNING = 0;
64     public static final int SRE_AT_BEGINNING_LINE = 1;
65     public static final int SRE_AT_BEGINNING_STRING = 2;
66     public static final int SRE_AT_BOUNDARY = 3;
67     public static final int SRE_AT_NON_BOUNDARY = 4;
68     public static final int SRE_AT_END = 5;
69     public static final int SRE_AT_END_LINE = 6;
70     public static final int SRE_AT_END_STRING = 7;
71     public static final int SRE_AT_LOC_BOUNDARY = 8;
72     public static final int SRE_AT_LOC_NON_BOUNDARY = 9;
73     public static final int SRE_AT_UNI_BOUNDARY = 10;
74     public static final int SRE_AT_UNI_NON_BOUNDARY = 11;
75
76     public static final int SRE_CATEGORY_DIGIT = 0;
77     public static final int SRE_CATEGORY_NOT_DIGIT = 1;
78     public static final int SRE_CATEGORY_SPACE = 2;
79     public static final int SRE_CATEGORY_NOT_SPACE = 3;
80     public static final int SRE_CATEGORY_WORD = 4;
81     public static final int SRE_CATEGORY_NOT_WORD = 5;
82     public static final int SRE_CATEGORY_LINEBREAK = 6;
83     public static final int SRE_CATEGORY_NOT_LINEBREAK = 7;
84     public static final int SRE_CATEGORY_LOC_WORD = 8;
85     public static final int SRE_CATEGORY_LOC_NOT_WORD = 9;
86     public static final int SRE_CATEGORY_UNI_DIGIT = 10;
87     public static final int SRE_CATEGORY_UNI_NOT_DIGIT = 11;
88     public static final int SRE_CATEGORY_UNI_SPACE = 12;
89     public static final int SRE_CATEGORY_UNI_NOT_SPACE = 13;
90     public static final int SRE_CATEGORY_UNI_WORD = 14;
91     public static final int SRE_CATEGORY_UNI_NOT_WORD = 15;
92     public static final int SRE_CATEGORY_UNI_LINEBREAK = 16;
93     public static final int SRE_CATEGORY_UNI_NOT_LINEBREAK = 17;
94
95     public static final int SRE_FLAG_TEMPLATE = 1;
96     public static final int SRE_FLAG_IGNORECASE = 2;
97     public static final int SRE_FLAG_LOCALE = 4;
98     public static final int SRE_FLAG_MULTILINE = 8;
99     public static final int SRE_FLAG_DOTALL = 16;
100     public static final int SRE_FLAG_UNICODE = 32;
101     public static final int SRE_FLAG_VERBOSE = 64;
102
103     public static final int SRE_INFO_PREFIX = 1;
104     public static final int SRE_INFO_LITERAL = 2;
105     public static final int SRE_INFO_CHARSET = 4;
106
107
108     public static final int USE_RECURSION_LIMIT = 2000;
109
110
111     /* string pointers */
112     int ptr; /* current position (also end of current slice) */
113     int beginning; /* start of original string */
114     int start; /* start of current slice */
115     int end; /* end of original string */
116
117     /* attributes for the match object */
118     char[] str;
119     int pos;
120     int endpos;
121
122     /* character size */
123     int charsize;
124
125     /* registers */
126     int lastindex;
127     int lastmark;
128
129     /* FIXME: <fl> should be dynamically allocated! */
130     int[] mark = new int[200];
131
132     /* dynamically allocated stuff */
133     int[] mark_stack;
134     int mark_stack_size;
135     int mark_stack_base;
136
137     SRE_REPEAT repeat; /* current repeat context */
138
139     /* debugging */
140     int maxlevel;
141
142     /* duplicated from the PatternObject */
143     int flags;
144
145
146
147
148     public SRE_STATE(String JavaDoc str, int start, int end, int flags) {
149         this.str = str.toCharArray();
150         int size = str.length();
151
152         this.charsize = 1;
153
154         /* adjust boundaries */
155         if (start < 0)
156             start = 0;
157         else if (start > size)
158             start = size;
159
160         if (end < 0)
161             end = 0;
162         else if (end > size)
163             end = size;
164
165         this.start = start;
166         this.end = end;
167
168         this.pos = start;
169         this.endpos = end;
170
171         state_reset();
172
173         this.flags = flags;
174     }
175
176
177
178     private void mark_fini() {
179         mark_stack = null;
180         mark_stack_size = mark_stack_base = 0;
181     }
182
183
184     private void mark_save(int lo, int hi) {
185         if (hi <= lo)
186             return;
187
188         int size = (hi - lo) + 1;
189
190         int newsize = mark_stack_size;
191         int minsize = mark_stack_base + size;
192
193         int[] stack;
194
195         if (newsize < minsize) {
196             /* create new stack */
197             if (newsize == 0) {
198                 newsize = 512;
199                 if (newsize < minsize)
200                     newsize = minsize;
201                 //TRACE(0, ptr, "allocate stack " + newsize);
202
stack = new int[newsize];
203             } else {
204                 /* grow the stack */
205                 while (newsize < minsize)
206                     newsize += newsize;
207                 //TRACE(0, ptr, "grow stack to " + newsize);
208
stack = new int[newsize];
209                 System.arraycopy(mark_stack, 0, stack, 0, mark_stack.length);
210             }
211             mark_stack = stack;
212             mark_stack_size = newsize;
213         }
214
215         //TRACE(0, ptr, "copy " + lo + ":" + hi + " to " + mark_stack_base + " (" + size + ")");
216

217         System.arraycopy(mark, lo, mark_stack, mark_stack_base, size);
218
219         mark_stack_base += size;
220     }
221
222
223     private void mark_restore(int lo, int hi) {
224         if (hi <= lo)
225             return;
226
227         int size = (hi - lo) + 1;
228
229         mark_stack_base -= size;
230
231         //TRACE(0, ptr, "copy " + lo + ":" + hi + " from " + mark_stack_base);
232

233         System.arraycopy(mark_stack, mark_stack_base, mark, lo, size);
234     }
235
236
237
238     final boolean SRE_AT(int ptr, char at) {
239         /* check if pointer is at given position. return 1 if so, 0
240            otherwise */

241
242         boolean thiS, that;
243
244         switch (at) {
245         case SRE_AT_BEGINNING:
246         case SRE_AT_BEGINNING_STRING:
247             return ptr == beginning;
248
249         case SRE_AT_BEGINNING_LINE:
250             return (ptr == beginning || SRE_IS_LINEBREAK(str[ptr-1]));
251
252         case SRE_AT_END:
253             return (ptr+1 == end && SRE_IS_LINEBREAK(str[ptr])) || ptr == end;
254
255         case SRE_AT_END_LINE:
256             return ptr == end || SRE_IS_LINEBREAK(str[ptr]);
257
258         case SRE_AT_END_STRING:
259             return ptr == end;
260
261         case SRE_AT_BOUNDARY:
262             /* word boundary */
263             if (beginning == end)
264                 return false;
265             that = (ptr > beginning) ? SRE_IS_WORD(str[ptr-1]) : false;
266             thiS = (ptr < end) ? SRE_IS_WORD(str[ptr]) : false;
267             return thiS != that;
268
269         case SRE_AT_NON_BOUNDARY:
270             /* word non-boundary */
271             if (beginning == end)
272                 return false;
273             that = (ptr > beginning) ? SRE_IS_WORD(str[ptr-1]) : false;
274             thiS = (ptr < end) ? SRE_IS_WORD(str[ptr]) : false;
275             return thiS == that;
276
277         case SRE_AT_LOC_BOUNDARY:
278         case SRE_AT_UNI_BOUNDARY:
279             if (beginning == end)
280                 return false;
281             that = (ptr > beginning) ? SRE_LOC_IS_WORD(str[ptr-1]) : false;
282             thiS = (ptr < end) ? SRE_LOC_IS_WORD(str[ptr]) : false;
283             return thiS != that;
284
285         case SRE_AT_LOC_NON_BOUNDARY:
286         case SRE_AT_UNI_NON_BOUNDARY:
287             /* word non-boundary */
288             if (beginning == end)
289                 return false;
290             that = (ptr > beginning) ? SRE_LOC_IS_WORD(str[ptr-1]) : false;
291             thiS = (ptr < end) ? SRE_LOC_IS_WORD(str[ptr]) : false;
292             return thiS == that;
293         }
294
295         return false;
296     }
297
298
299
300
301     final boolean SRE_CHARSET(char[] set, int setidx, char ch) {
302         /* check if character is a member of the given set. return 1 if
303            so, 0 otherwise */

304
305         boolean ok = true;
306
307         for (;;) {
308             switch (set[setidx++]) {
309
310             case SRE_OP_LITERAL:
311                 //TRACE(setidx, ch, "CHARSET LITERAL " + (int) set[setidx]);
312
/* <LITERAL> <code> */
313                 if (ch == set[setidx])
314                     return ok;
315                 setidx++;
316                 break;
317
318             case SRE_OP_RANGE:
319                 /* <RANGE> <lower> <upper> */
320                 //TRACE(setidx, ch, "CHARSET RANGE " + (int) set[setidx] + " " + (int) set[setidx+1]);
321
if (set[setidx] <= ch && ch <= set[setidx+1])
322                     return ok;
323                 setidx += 2;
324                 break;
325
326             case SRE_OP_CHARSET:
327                 //TRACE(setidx, ch, "CHARSET CHARSET ");
328
/* <CHARSET> <bitmap> (16 bits per code word) */
329                 if (ch < 256 &&
330                             (set[setidx + (ch >> 4)] & (1 << (ch & 15))) != 0)
331                     return ok;
332                 setidx += 16;
333                 break;
334
335             case SRE_OP_BIGCHARSET:
336                 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
337                 //TRACE(setidx, ch, "CHARSET BIGCHARSET ");
338
int count = set[setidx++];
339                 int shift = ((ch >> 8) & 1) == 0 ? 8 : 0;
340                 int block = (set[setidx + (ch >> 8) / 2] >> shift) & 0xFF;
341                 setidx += 128;
342                 int idx = block*16 + ((ch & 255)>>4);
343                 if ((set[setidx + idx] & (1 << (ch & 15))) != 0)
344                     return ok;
345                 setidx += count*16;
346                 break;
347
348             case SRE_OP_CATEGORY:
349                 /* <CATEGORY> <code> */
350                 //TRACE(setidx, ch, "CHARSET CHARSET " + (int) set[setidx]);
351
if (sre_category(set[setidx], ch))
352                     return ok;
353                 setidx++;
354                 break;
355
356             case SRE_OP_NEGATE:
357                 //TRACE(setidx, ch, "CHARSET NEGATE");
358
ok = !ok;
359                 break;
360
361             case SRE_OP_FAILURE:
362                 //TRACE(setidx, ch, "CHARSET FAILURE");
363
return !ok;
364
365             default:
366                 /* internal error -- there's not much we can do about it
367                    here, so let's just pretend it didn't match... */

368                 return false;
369             }
370         }
371     }
372
373
374
375
376     private int SRE_COUNT(char[] pattern, int pidx, int maxcount, int level) {
377         char chr;
378         int ptr = this.ptr;
379         int end = this.end;
380         int i;
381
382         /* adjust end */
383         if (maxcount < end - ptr && maxcount != 65535)
384             end = ptr + maxcount;
385
386         switch (pattern[pidx]) {
387
388         case SRE_OP_ANY:
389             /* repeated dot wildcard. */
390             //TRACE(pidx, ptr, "COUNT ANY");
391
while (ptr < end && !SRE_IS_LINEBREAK(str[ptr]))
392                 ptr++;
393             break;
394
395         case SRE_OP_ANY_ALL:
396             /* repeated dot wildcare. skip to the end of the target
397                string, and backtrack from there */

398             //TRACE(pidx, ptr, "COUNT ANY_ALL");
399
ptr = end;
400             break;
401
402         case SRE_OP_LITERAL:
403             /* repeated literal */
404             chr = pattern[pidx+1];
405             //TRACE(pidx, ptr, "COUNT LITERAL " + (int) chr);
406
while (ptr < end && str[ptr] == chr)
407                 ptr++;
408             break;
409
410         case SRE_OP_LITERAL_IGNORE:
411             /* repeated literal */
412             chr = pattern[pidx+1];
413             //TRACE(pidx, ptr, "COUNT LITERAL_IGNORE " + (int) chr);
414
while (ptr < end && lower(str[ptr]) == chr)
415                 ptr++;
416             break;
417
418         case SRE_OP_NOT_LITERAL:
419             /* repeated non-literal */
420             chr = pattern[pidx+1];
421             //TRACE(pidx, ptr, "COUNT NOT_LITERAL " + (int) chr);
422
while (ptr < end && str[ptr] != chr)
423                 ptr++;
424             break;
425
426         case SRE_OP_NOT_LITERAL_IGNORE:
427             /* repeated non-literal */
428             chr = pattern[pidx+1];
429             //TRACE(pidx, ptr, "COUNT NOT_LITERAL_IGNORE " + (int) chr);
430
while (ptr < end && lower(str[ptr]) != chr)
431                 ptr++;
432             break;
433
434         case SRE_OP_IN:
435             /* repeated set */
436             //TRACE(pidx, ptr, "COUNT IN");
437
while (ptr < end && SRE_CHARSET(pattern, pidx + 2, str[ptr]))
438                 ptr++;
439             break;
440
441         default:
442             /* repeated single character pattern */
443             //TRACE(pidx, ptr, "COUNT SUBPATTERN");
444
while (this.ptr < end) {
445                 i = SRE_MATCH(pattern, pidx, level);
446                 if (i < 0)
447                     return i;
448                 if (i == 0)
449                     break;
450             }
451             return this.ptr - ptr;
452         }
453
454         return ptr - this.ptr;
455     }
456
457
458
459     final int SRE_MATCH(char[] pattern, int pidx, int level) {
460         /* check if string matches the given pattern. returns <0 for
461            error, 0 for failure, and 1 for success */

462
463         int end = this.end;
464         int ptr = this.ptr;
465         int i, count;
466         char chr;
467
468         int lastmark;
469
470         //TRACE(pidx, ptr, "ENTER " + level);
471

472         if (level > USE_RECURSION_LIMIT)
473            return SRE_ERROR_RECURSION_LIMIT;
474
475         if (pattern[pidx] == SRE_OP_INFO) {
476             /* optimization info block */
477             /* args: <1=skip> <2=flags> <3=min> ... */
478             if (pattern[pidx+3] != 0 && (end - ptr) < pattern[pidx+3]) {
479                 return 0;
480             }
481             pidx += pattern[pidx+1] + 1;
482         }
483
484
485         for (;;) {
486
487             switch (pattern[pidx++]) {
488
489             case SRE_OP_FAILURE:
490                 /* immediate failure */
491                 //TRACE(pidx, ptr, "FAILURE");
492
return 0;
493
494             case SRE_OP_SUCCESS:
495                 /* end of pattern */
496                 //TRACE(pidx, ptr, "SUCCESS");
497
this.ptr = ptr;
498                 return 1;
499
500             case SRE_OP_AT:
501                 /* match at given position */
502                 /* <AT> <code> */
503                 //TRACE(pidx, ptr, "AT " + (int) pattern[pidx]);
504
if (!SRE_AT(ptr, pattern[pidx]))
505                     return 0;
506                 pidx++;
507                 break;
508
509             case SRE_OP_CATEGORY:
510                 /* match at given category */
511                 /* <CATEGORY> <code> */
512                 //TRACE(pidx, ptr, "CATEGORY " + (int)pattern[pidx]);
513

514                 if (ptr >= end || !sre_category(pattern[pidx], str[ptr]))
515                     return 0;
516
517                 pidx++;
518                 ptr++;
519                 break;
520
521             case SRE_OP_LITERAL:
522                 /* match literal character */
523                 /* <LITERAL> <code> */
524                 //TRACE(pidx, ptr, "LITERAL " + (int) pattern[pidx]);
525

526                 if (ptr >= end || str[ptr] != pattern[pidx])
527                     return 0;
528                 pidx++;
529                 ptr++;
530                 break;
531
532             case SRE_OP_NOT_LITERAL:
533                 /* match anything that is not literal character */
534                 /* args: <code> */
535                 //TRACE(pidx, ptr, "NOT_LITERAL " + (int) pattern[pidx]);
536
if (ptr >= end || str[ptr] == pattern[pidx])
537                     return 0;
538                 pidx++;
539                 ptr++;
540                 break;
541
542             case SRE_OP_ANY:
543                 /* match anything */
544                 //TRACE(pidx, ptr, "ANY");
545
if (ptr >= end || SRE_IS_LINEBREAK(str[ptr]))
546                     return 0;
547                 ptr++;
548                 break;
549
550             case SRE_OP_ANY_ALL:
551                 /* match anything */
552                 /* <ANY_ALL> */
553                 //TRACE(pidx, ptr, "ANY_ALL");
554
if (ptr >= end)
555                     return 0;
556                 ptr++;
557                 break;
558
559             case SRE_OP_IN:
560                 /* match set member (or non_member) */
561                 /* <IN> <skip> <set> */
562                 //TRACE(pidx, ptr, "IN");
563
if (ptr >= end || !SRE_CHARSET(pattern, pidx + 1, str[ptr]))
564                     return 0;
565                 pidx += (int)pattern[pidx];
566                 ptr++;
567                 break;
568
569             case SRE_OP_GROUPREF:
570                 /* match backreference */
571                 i = pattern[pidx];
572                 //TRACE(pidx, ptr, "GROUPREF " + i);
573
int p = mark[i+i];
574                 int e = mark[i+i+1];
575                 if (p == -1 || e == -1 || e < p)
576                     return 0;
577                 while (p < e) {
578                     if (ptr >= end || str[ptr] != str[p])
579                         return 0;
580                     p++;
581                     ptr++;
582                 }
583                 pidx++;
584                 break;
585
586             case SRE_OP_GROUPREF_IGNORE:
587                 /* match backreference */
588                 i = pattern[pidx];
589                 //TRACE(pidx, ptr, "GROUPREF_IGNORE " + i);
590
p = mark[i+i];
591                 e = mark[i+i+1];
592                 if (p == -1 || e == -1 || e < p)
593                     return 0;
594                 while (p < e) {
595                     if (ptr >= end || lower(str[ptr]) != lower(str[p]))
596                         return 0;
597                     p++;
598                     ptr++;
599                 }
600                 pidx++;
601                 break;
602
603             case SRE_OP_LITERAL_IGNORE:
604                 //TRACE(pidx, ptr, "LITERAL_IGNORE " + (int) pattern[pidx]);
605
if (ptr >= end || lower(str[ptr]) != lower(pattern[pidx]))
606                     return 0;
607                 pidx++;
608                 ptr++;
609                 break;
610
611             case SRE_OP_NOT_LITERAL_IGNORE:
612                 //TRACE(pidx, ptr, "NOT_LITERAL_IGNORE " + (int) pattern[pidx]);
613
if (ptr >= end || lower(str[ptr]) == lower(pattern[pidx]))
614                     return 0;
615                 pidx++;
616                 ptr++;
617                 break;
618
619             case SRE_OP_IN_IGNORE:
620                 //TRACE(pidx, ptr, "IN_IGNORE");
621
if (ptr >= end ||
622                         !SRE_CHARSET(pattern, pidx + 1, lower(str[ptr])))
623                     return 0;
624                 pidx += (int)pattern[pidx];
625                 ptr++;
626                 break;
627
628             case SRE_OP_MARK:
629                 /* set mark */
630                 /* <MARK> <gid> */
631                 //TRACE(pidx, ptr, "MARK " + (int) pattern[pidx]);
632
i = pattern[pidx];
633                 if ((i & 1) != 0)
634                     lastindex = i / 2 + 1;
635                 if (i > this.lastmark)
636                     this.lastmark = i;
637                 mark[i] = ptr;
638                 pidx++;
639                 break;
640
641             case SRE_OP_JUMP:
642             case SRE_OP_INFO:
643                 /* jump forward */
644                 /* <JUMP> <offset> */
645                 //TRACE(pidx, ptr, "JUMP " + (int) pattern[pidx]);
646
pidx += (int)pattern[pidx];
647                 break;
648
649             case SRE_OP_ASSERT:
650                 /* assert subpattern */
651                 /* args: <skip> <back> <pattern> */
652                 //TRACE(pidx, ptr, "ASSERT " + (int) pattern[pidx+1]);
653

654                 this.ptr = ptr - pattern[pidx + 1];
655                 if (this.ptr < this.beginning)
656                     return 0;
657                 i = SRE_MATCH(pattern, pidx + 2, level + 1);
658                 if (i <= 0)
659                     return i;
660                 pidx += pattern[pidx];
661                 break;
662
663             case SRE_OP_ASSERT_NOT:
664                 /* assert not subpattern */
665                 /* args: <skip> <pattern> */
666                 //TRACE(pidx, ptr, "ASSERT_NOT " + (int) pattern[pidx]);
667
this.ptr = ptr - pattern[pidx + 1];
668                 if (this.ptr >= this.beginning) {
669                     i = SRE_MATCH(pattern, pidx + 2, level + 1);
670                     if (i < 0)
671                         return i;
672                     if (i != 0)
673                         return 0;
674                 }
675                 pidx += pattern[pidx];
676                 break;
677
678             case SRE_OP_BRANCH:
679                 /* try an alternate branch */
680                 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
681                 //TRACE(pidx, ptr, "BRANCH");
682
lastmark = this.lastmark;
683                 for (; pattern[pidx] != 0; pidx += pattern[pidx]) {
684                     if (pattern[pidx+1] == SRE_OP_LITERAL &&
685                         (ptr >= end || str[ptr] != pattern[pidx+2]))
686                         continue;
687                     if (pattern[pidx+1] == SRE_OP_IN && (ptr >= end ||
688                                 !SRE_CHARSET(pattern, pidx + 3, str[ptr])))
689                         continue;
690                     this.ptr = ptr;
691                     i = SRE_MATCH(pattern, pidx + 1, level + 1);
692                     if (i != 0)
693                         return i;
694                     while (this.lastmark > lastmark)
695                         mark[this.lastmark--] = -1;
696                 }
697
698                 return 0;
699
700             case SRE_OP_REPEAT_ONE:
701                 /* match repeated sequence (maximizing regexp) */
702
703                 /* this operator only works if the repeated item is
704                    exactly one character wide, and we're not already
705                    collecting backtracking points. for other cases,
706                    use the MAX_REPEAT operator */

707
708                 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
709
710                 int mincount = pattern[pidx+1];
711
712                 //TRACE(pidx, ptr, "REPEAT_ONE " + mincount + " " + (int)pattern[pidx+2]);
713
if (ptr + mincount > end)
714                     return 0; /* cannot match */
715
716                 this.ptr = ptr;
717
718                 count = SRE_COUNT(pattern, pidx + 3, pattern[pidx+2],
719                                   level + 1);
720                 if (count < 0)
721                     return count;
722
723                 ptr += count;
724
725                 /* when we arrive here, count contains the number of
726                    matches, and ptr points to the tail of the target
727                    string. check if the rest of the pattern matches,
728                    and backtrack if not. */

729
730                 if (count < mincount)
731                     return 0;
732
733                 if (pattern[pidx + pattern[pidx]] == SRE_OP_SUCCESS) {
734                     /* tail is empty. we're finished */
735                     this.ptr = ptr;
736                     return 1;
737
738                 } else if (pattern[pidx + pattern[pidx]] == SRE_OP_LITERAL) {
739                     /* tail starts with a literal. skip positions where
740                        the rest of the pattern cannot possibly match */

741                     chr = pattern[pidx + pattern[pidx]+1];
742                     for (;;) {
743                         while (count >= mincount &&
744                                (ptr >= end || str[ptr] != chr)) {
745                             ptr--;
746                             count--;
747                         }
748                         if (count < mincount)
749                             break;
750                         this.ptr = ptr;
751                         i = SRE_MATCH(pattern, pidx + pattern[pidx],
752                                      level + 1);
753                         if (i != 0)
754                             return 1;
755                         ptr--;
756                         count--;
757                     }
758
759                 } else {
760                     /* general case */
761                     lastmark = this.lastmark;
762                     while (count >= mincount) {
763                         this.ptr = ptr;
764                         i = SRE_MATCH(pattern, pidx + pattern[pidx],
765                                       level + 1);
766                         if (i != 0)
767                             return i;
768                         ptr--;
769                         count--;
770                         while (this.lastmark > lastmark)
771                             mark[this.lastmark--] = -1;
772                     }
773                 }
774                 return 0;
775
776
777             case SRE_OP_REPEAT:
778                 /* create repeat context. all the hard work is done
779                    by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */

780                 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
781
782                 //TRACE(pidx, ptr, "REPEAT " + (int)pattern[pidx+1] + " " + (int)pattern[pidx+2]);
783

784                 SRE_REPEAT rep = new SRE_REPEAT(repeat);
785                 rep.count = -1;
786                 rep.pidx = pidx;
787                 repeat = rep;
788
789                 this.ptr = ptr;
790                 i = SRE_MATCH(pattern, pidx + pattern[pidx], level + 1);
791
792                 repeat = rep.prev;
793                 return i;
794
795
796
797             case SRE_OP_MAX_UNTIL:
798                 /* maximizing repeat */
799                 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
800
801                 /* FIXME: we probably need to deal with zero-width
802                    matches in here... */

803
804                 SRE_REPEAT rp = this.repeat;
805                 if (rp == null)
806                     return SRE_ERROR_STATE;
807
808                 this.ptr = ptr;
809
810                 count = rp.count + 1;
811
812                 //TRACE(pidx, ptr, "MAX_UNTIL " + count);
813

814                 if (count < pattern[rp.pidx + 1]) {
815                     /* not enough matches */
816
817                     rp.count = count;
818                     i = SRE_MATCH(pattern, rp.pidx + 3, level + 1);
819                     if (i != 0)
820                         return i;
821                     rp.count = count - 1;
822                     this.ptr = ptr;
823                     return 0;
824                 }
825
826                 if (count < pattern[rp.pidx+2] ||
827                                             pattern[rp.pidx+2] == 65535) {
828                     /* we may have enough matches, but if we can
829                        match another item, do so */

830                     rp.count = count;
831                     lastmark = this.lastmark;
832                     mark_save(0, lastmark);
833                     /* RECURSIVE */
834                     i = SRE_MATCH(pattern, rp.pidx + 3, level + 1);
835                     if (i != 0)
836                         return i;
837                     mark_restore(0, lastmark);
838                     this.lastmark = lastmark;
839                     rp.count = count - 1;
840                     this.ptr = ptr;
841                 }
842
843                 /* cannot match more repeated items here. make sure the
844                    tail matches */

845                 this.repeat = rp.prev;
846                 /* RECURSIVE */
847                 i = SRE_MATCH(pattern, pidx, level + 1);
848                 if (i != 0)
849                     return i;
850                 this.repeat = rp;
851                 this.ptr = ptr;
852                 return 0;
853
854             case SRE_OP_MIN_UNTIL:
855                 /* minimizing repeat */
856                 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
857
858                 rp = this.repeat;
859                 if (rp == null)
860                     return SRE_ERROR_STATE;
861
862                 count = rp.count + 1;
863
864                 //TRACE(pidx, ptr, "MIN_UNTIL " + count + " " + rp.pidx);
865

866                 this.ptr = ptr;
867
868                 if (count < pattern[rp.pidx + 1]) {
869                     /* not enough matches */
870                     rp.count = count;
871                     /* RECURSIVE */
872                     i = SRE_MATCH(pattern, rp.pidx + 3, level + 1);
873                     if (i != 0)
874                         return i;
875                     rp.count = count-1;
876                     this.ptr = ptr;
877                     return 0;
878                 }
879
880                 /* see if the tail matches */
881                 this.repeat = rp.prev;
882                 i = SRE_MATCH(pattern, pidx, level + 1);
883                 if (i != 0)
884                     return i;
885
886                 this.ptr = ptr;
887                 this.repeat = rp;
888
889                 if (count >= pattern[rp.pidx+2] &&
890                                                 pattern[rp.pidx+2] != 65535)
891                     return 0;
892
893                 rp.count = count;
894                 /* RECURSIVE */
895                 i = SRE_MATCH(pattern, rp.pidx + 3, level + 1);
896                 if (i != 0)
897                     return i;
898                 rp.count = count - 1;
899                 this.ptr = ptr;
900                 return 0;
901
902
903             default:
904                 //TRACE(pidx, ptr, "UNKNOWN " + (int) pattern[pidx-1]);
905
return SRE_ERROR_ILLEGAL;
906             }
907         }
908
909         //return SRE_ERROR_ILLEGAL;
910
}
911
912
913
914
915     int SRE_SEARCH(char[] pattern, int pidx) {
916         int ptr = this.start;
917         int end = this.end;
918         int status = 0;
919         int prefix_len = 0;
920         int prefix_skip = 0;
921         int prefix = 0;
922         int charset = 0;
923         int overlap = 0;
924         int flags = 0;
925
926         if (pattern[pidx] == SRE_OP_INFO) {
927             /* optimization info block */
928             /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
929
930             flags = pattern[pidx+2];
931
932             if (pattern[pidx+3] > 0) {
933                 /* adjust end point (but make sure we leave at least one
934                    character in there, so literal search will work) */

935                 end -= pattern[pidx+3]-1;
936                 if (end <= ptr)
937                     end = ptr; // FBO
938
}
939
940             if ((flags & SRE_INFO_PREFIX) != 0) {
941                 /* pattern starts with a known prefix */
942                 /* <length> <skip> <prefix data> <overlap data> */
943                 prefix_len = pattern[pidx+5];
944                 prefix_skip = pattern[pidx+6];
945                 prefix = pidx + 7;
946                 overlap = prefix + prefix_len - 1;
947             } else if ((flags & SRE_INFO_CHARSET) != 0) {
948                 /* pattern starts with a character from a known set */
949                 /* <charset> */
950                 charset = pidx + 5;
951             }
952
953             pidx += 1 + pattern[pidx+1];
954         }
955
956
957         if (prefix_len > 1) {
958             /* pattern starts with a known prefix. use the overlap
959                table to skip forward as fast as we possibly can */

960             int i = 0;
961             end = this.end;
962             while (ptr < end) {
963                 for (;;) {
964                     if (str[ptr] != pattern[prefix+i]) {
965                         if (i == 0)
966                             break;
967                         else
968                             i = pattern[overlap+i];
969                     } else {
970                         if (++i == prefix_len) {
971                             /* found a potential match */
972                             //TRACE(pidx, ptr, "SEARCH SCAN " + prefix_skip + " " + prefix_len);
973
this.start = ptr + 1 - prefix_len;
974                             this.ptr = ptr + 1 - prefix_len + prefix_skip;
975                             if ((flags & SRE_INFO_LITERAL) != 0)
976                                 return 1; /* we got all of it */
977                             status = SRE_MATCH(pattern,
978                                                pidx + 2*prefix_skip, 1);
979                             if (status != 0)
980                                 return status;
981                             /* close but no cigar -- try again */
982                             i = pattern[overlap + i];
983                         }
984                         break;
985                     }
986
987                 }
988                 ptr++;
989             }
990             return 0;
991         }
992
993         if (pattern[pidx] == SRE_OP_LITERAL) {
994             /* pattern starts with a literal */
995             char chr = pattern[pidx + 1];
996             end = this.end;
997             for (;;) {
998                 while (ptr < end && str[ptr] != chr)
999                     ptr++;
1000                if (ptr == end)
1001                    return 0;
1002                //TRACE(pidx, ptr, "SEARCH LITERAL");
1003
this.start = ptr;
1004                this.ptr = ++ptr;
1005                if ((flags & SRE_INFO_LITERAL) != 0)
1006                    return 1;
1007                status = SRE_MATCH(pattern, pidx + 2, 1);
1008                if (status != 0)
1009                    break;
1010            }
1011
1012        } else if (charset != 0) {
1013            /* pattern starts with a character from a known set */
1014            end = this.end;
1015            for (;;) {
1016                while (ptr < end && !SRE_CHARSET(pattern, charset, str[ptr]))
1017                    ptr++;
1018                if (ptr == end)
1019                    return 0;
1020                //TRACE(pidx, ptr, "SEARCH CHARSET");
1021
this.start = ptr;
1022                this.ptr = ptr;
1023                status = SRE_MATCH(pattern, pidx, 1);
1024                if (status != 0)
1025                    break;
1026                ptr++;
1027            }
1028
1029        } else {
1030            /* general case */
1031            while (ptr <= end) {
1032                //TRACE(pidx, ptr, "SEARCH");
1033
this.start = this.ptr = ptr++;
1034                status = SRE_MATCH(pattern, pidx, 1);
1035                if (status != 0)
1036                    break;
1037            }
1038        }
1039
1040        return status;
1041    }
1042
1043
1044    final boolean sre_category(char category, char ch) {
1045        switch (category) {
1046
1047        case SRE_CATEGORY_DIGIT:
1048            return SRE_IS_DIGIT(ch);
1049        case SRE_CATEGORY_NOT_DIGIT:
1050            return ! SRE_IS_DIGIT(ch);
1051
1052        case SRE_CATEGORY_SPACE:
1053            return SRE_IS_SPACE(ch);
1054        case SRE_CATEGORY_NOT_SPACE:
1055            return ! SRE_IS_SPACE(ch);
1056
1057        case SRE_CATEGORY_WORD:
1058            return SRE_IS_WORD(ch);
1059        case SRE_CATEGORY_NOT_WORD:
1060            return ! SRE_IS_WORD(ch);
1061
1062        case SRE_CATEGORY_LINEBREAK:
1063            return SRE_IS_LINEBREAK(ch);
1064        case SRE_CATEGORY_NOT_LINEBREAK:
1065            return ! SRE_IS_LINEBREAK(ch);
1066
1067        case SRE_CATEGORY_LOC_WORD:
1068            return SRE_LOC_IS_WORD(ch);
1069        case SRE_CATEGORY_LOC_NOT_WORD:
1070            return ! SRE_LOC_IS_WORD(ch);
1071
1072
1073        case SRE_CATEGORY_UNI_DIGIT:
1074            return Character.isDigit(ch);
1075        case SRE_CATEGORY_UNI_NOT_DIGIT:
1076            return !Character.isDigit(ch);
1077
1078        case SRE_CATEGORY_UNI_SPACE:
1079            return Character.isWhitespace(ch);
1080        case SRE_CATEGORY_UNI_NOT_SPACE:
1081            return !Character.isWhitespace(ch);
1082
1083        case SRE_CATEGORY_UNI_WORD:
1084            return Character.isLetterOrDigit(ch) || ch == '_';
1085        case SRE_CATEGORY_UNI_NOT_WORD:
1086            return ! (Character.isLetterOrDigit(ch) || ch == '_');
1087
1088        case SRE_CATEGORY_UNI_LINEBREAK:
1089           return SRE_UNI_IS_LINEBREAK(ch);
1090        case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1091           return ! SRE_UNI_IS_LINEBREAK(ch);
1092
1093        }
1094        return false;
1095    }
1096
1097
1098    /* default character predicates (run sre_chars.py to regenerate tables) */
1099
1100    static final int SRE_DIGIT_MASK = 1;
1101    static final int SRE_SPACE_MASK = 2;
1102    static final int SRE_LINEBREAK_MASK = 4;
1103    static final int SRE_ALNUM_MASK = 8;
1104    static final int SRE_WORD_MASK = 16;
1105
1106    static byte[] sre_char_info = new byte[] {
1107        0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1108        2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1109        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
1110        25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
1111        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1112        0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
1113        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
1114
1115    static byte[] sre_char_lower = new byte[] {
1116        0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
1117        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
1118        27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
1119        44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
1120        61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
1121        108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
1122        122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
1123        106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
1124        120, 121, 122, 123, 124, 125, 126, 127 };
1125
1126    final boolean SRE_IS_DIGIT(char ch) {
1127        return ((ch) < 128 ?
1128                (sre_char_info[(ch)] & SRE_DIGIT_MASK) != 0 : false);
1129    }
1130
1131    final boolean SRE_IS_SPACE(char ch) {
1132        return ((ch) < 128 ?
1133                (sre_char_info[(ch)] & SRE_SPACE_MASK) != 0 : false);
1134    }
1135
1136    final boolean SRE_IS_WORD(char ch) {
1137        return ((ch) < 128 ?
1138                (sre_char_info[(ch)] & SRE_WORD_MASK) != 0 : false);
1139    }
1140
1141    final boolean SRE_IS_LINEBREAK(char ch) {
1142        return ch == '\n';
1143    }
1144
1145    final boolean SRE_LOC_IS_WORD(char ch) {
1146        return Character.isLetterOrDigit(ch) || ch == '_';
1147    }
1148
1149    final boolean SRE_UNI_IS_LINEBREAK(char ch) {
1150        switch (ch) {
1151        case 0x000A: /* LINE FEED */
1152        case 0x000D: /* CARRIAGE RETURN */
1153        case 0x001C: /* FILE SEPARATOR */
1154        case 0x001D: /* GROUP SEPARATOR */
1155        case 0x001E: /* RECORD SEPARATOR */
1156        case 0x0085: /* NEXT LINE */
1157        case 0x2028: /* LINE SEPARATOR */
1158        case 0x2029: /* PARAGRAPH SEPARATOR */
1159            return true;
1160        default:
1161            return false;
1162        }
1163    }
1164
1165
1166    final char lower(char ch) {
1167        if ((flags & SRE_FLAG_LOCALE) != 0)
1168             return ((ch) < 256 ? Character.toLowerCase(ch) : ch);
1169        if ((flags & SRE_FLAG_UNICODE) != 0)
1170             return Character.toLowerCase(ch);
1171        return ((ch) < 128 ? (char)sre_char_lower[ch] : ch);
1172    }
1173
1174
1175    public static int getlower(int ch, int flags) {
1176        if ((flags & SRE_FLAG_LOCALE) != 0)
1177             return ((ch) < 256 ? Character.toLowerCase((char) ch) : ch);
1178        if ((flags & SRE_FLAG_UNICODE) != 0)
1179             return Character.toLowerCase((char)ch);
1180        return ((ch) < 128 ? (char)sre_char_lower[ch] : ch);
1181    }
1182
1183
1184
1185
1186
1187    String JavaDoc getslice(int index, String JavaDoc string, boolean empty) {
1188        int i, j;
1189
1190        index = (index - 1) * 2;
1191
1192        if (string == null || mark[index] == -1 || mark[index+1] == -1) {
1193            if (empty) {
1194                /* want empty string */
1195                i = j = 0;
1196            } else {
1197                return null;
1198            }
1199        } else {
1200            i = mark[index];
1201            j = mark[index+1];
1202        }
1203
1204        return string.substring(i, j);
1205    }
1206
1207
1208
1209
1210    void state_reset() {
1211        lastmark = 0;
1212
1213        /* FIXME: dynamic! */
1214        for (int i = 0; i < mark.length; i++)
1215            mark[i] = -1;
1216
1217        lastindex = -1;
1218        repeat = null;
1219
1220        mark_fini();
1221    }
1222
1223
1224    private void TRACE(int pidx, int ptr, String JavaDoc string) {
1225        //System.out.println(" |" + pidx + "|" + ptr + ": " + string);
1226
}
1227}
1228
Popular Tags