1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16#ifdef SLRE_TEST
17#include <stdio.h>
18#include <assert.h>
19#include <ctype.h>
20#include <stdlib.h>
21#include <string.h>
22#else
23#include <common.h>
24#include <linux/ctype.h>
25#endif
26
27#include <errno.h>
28
29#include <slre.h>
30
31enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
32 STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
33
34#ifdef SLRE_TEST
35static struct {
36 const char *name;
37 int narg;
38 const char *flags;
39} opcodes[] = {
40 {"END", 0, ""},
41 {"BRANCH", 2, "oo"},
42 {"ANY", 0, ""},
43 {"EXACT", 2, "d"},
44 {"ANYOF", 2, "D"},
45 {"ANYBUT", 2, "D"},
46 {"OPEN ", 1, "i"},
47 {"CLOSE", 1, "i"},
48 {"BOL", 0, ""},
49 {"EOL", 0, ""},
50 {"STAR", 1, "o"},
51 {"PLUS", 1, "o"},
52 {"STARQ", 1, "o"},
53 {"PLUSQ", 1, "o"},
54 {"QUEST", 1, "o"},
55 {"SPACE", 0, ""},
56 {"NONSPACE", 0, ""},
57 {"DIGIT", 0, ""}
58};
59#endif
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92static const char *meta_chars = "|.^$*+?()[\\";
93
94#ifdef SLRE_TEST
95
96static void
97print_character_set(FILE *fp, const unsigned char *p, int len)
98{
99 int i;
100
101 for (i = 0; i < len; i++) {
102 if (i > 0)
103 (void) fputc(',', fp);
104 if (p[i] == 0) {
105 i++;
106 if (p[i] == 0)
107 (void) fprintf(fp, "\\x%02x", p[i]);
108 else
109 (void) fprintf(fp, "%s", opcodes[p[i]].name);
110 } else if (isprint(p[i])) {
111 (void) fputc(p[i], fp);
112 } else {
113 (void) fprintf(fp, "\\x%02x", p[i]);
114 }
115 }
116}
117
118void
119slre_dump(const struct slre *r, FILE *fp)
120{
121 int i, j, ch, op, pc;
122
123 for (pc = 0; pc < r->code_size; pc++) {
124
125 op = r->code[pc];
126 (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
127
128 for (i = 0; opcodes[op].flags[i] != '\0'; i++)
129 switch (opcodes[op].flags[i]) {
130 case 'i':
131 (void) fprintf(fp, "%d ", r->code[pc + 1]);
132 pc++;
133 break;
134 case 'o':
135 (void) fprintf(fp, "%d ",
136 pc + r->code[pc + 1] - i);
137 pc++;
138 break;
139 case 'D':
140 print_character_set(fp, r->data +
141 r->code[pc + 1], r->code[pc + 2]);
142 pc += 2;
143 break;
144 case 'd':
145 (void) fputc('"', fp);
146 for (j = 0; j < r->code[pc + 2]; j++) {
147 ch = r->data[r->code[pc + 1] + j];
148 if (isprint(ch)) {
149 (void) fputc(ch, fp);
150 } else {
151 (void) fprintf(fp,
152 "\\x%02x", ch);
153 }
154 }
155 (void) fputc('"', fp);
156 pc += 2;
157 break;
158 }
159
160 (void) fputc('\n', fp);
161 }
162}
163#endif
164
165static void
166set_jump_offset(struct slre *r, int pc, int offset)
167{
168 assert(offset < r->code_size);
169
170 if (r->code_size - offset > 0xff)
171 r->err_str = "Jump offset is too big";
172 else
173 r->code[pc] = (unsigned char) (r->code_size - offset);
174}
175
176static void
177emit(struct slre *r, int code)
178{
179 if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
180 r->err_str = "RE is too long (code overflow)";
181 else
182 r->code[r->code_size++] = (unsigned char) code;
183}
184
185static void
186store_char_in_data(struct slre *r, int ch)
187{
188 if (r->data_size >= (int) sizeof(r->data))
189 r->err_str = "RE is too long (data overflow)";
190 else
191 r->data[r->data_size++] = ch;
192}
193
194static void
195exact(struct slre *r, const char **re)
196{
197 int old_data_size = r->data_size;
198
199 while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
200 store_char_in_data(r, *(*re)++);
201
202 emit(r, EXACT);
203 emit(r, old_data_size);
204 emit(r, r->data_size - old_data_size);
205}
206
207static int
208get_escape_char(const char **re)
209{
210 int res;
211
212 switch (*(*re)++) {
213 case 'n':
214 res = '\n';
215 break;
216 case 'r':
217 res = '\r';
218 break;
219 case 't':
220 res = '\t';
221 break;
222 case '0':
223 res = 0;
224 break;
225 case 'S':
226 res = NONSPACE << 8;
227 break;
228 case 's':
229 res = SPACE << 8;
230 break;
231 case 'd':
232 res = DIGIT << 8;
233 break;
234 default:
235 res = (*re)[-1];
236 break;
237 }
238
239 return res;
240}
241
242static void
243anyof(struct slre *r, const char **re)
244{
245 int esc, old_data_size = r->data_size, op = ANYOF;
246
247 if (**re == '^') {
248 op = ANYBUT;
249 (*re)++;
250 }
251
252 while (**re != '\0')
253
254 switch (*(*re)++) {
255 case ']':
256 emit(r, op);
257 emit(r, old_data_size);
258 emit(r, r->data_size - old_data_size);
259 return;
260
261 break;
262 case '\\':
263 esc = get_escape_char(re);
264 if ((esc & 0xff) == 0) {
265 store_char_in_data(r, 0);
266 store_char_in_data(r, esc >> 8);
267 } else {
268 store_char_in_data(r, esc);
269 }
270 break;
271 default:
272 store_char_in_data(r, (*re)[-1]);
273 break;
274 }
275
276 r->err_str = "No closing ']' bracket";
277}
278
279static void
280relocate(struct slre *r, int begin, int shift)
281{
282 emit(r, END);
283 memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
284 r->code_size += shift;
285}
286
287static void
288quantifier(struct slre *r, int prev, int op)
289{
290 if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
291 r->code[prev + 2]--;
292 emit(r, EXACT);
293 emit(r, r->code[prev + 1] + r->code[prev + 2]);
294 emit(r, 1);
295 prev = r->code_size - 3;
296 }
297 relocate(r, prev, 2);
298 r->code[prev] = op;
299 set_jump_offset(r, prev + 1, prev);
300}
301
302static void
303exact_one_char(struct slre *r, int ch)
304{
305 emit(r, EXACT);
306 emit(r, r->data_size);
307 emit(r, 1);
308 store_char_in_data(r, ch);
309}
310
311static void
312fixup_branch(struct slre *r, int fixup)
313{
314 if (fixup > 0) {
315 emit(r, END);
316 set_jump_offset(r, fixup, fixup - 2);
317 }
318}
319
320static void
321compile(struct slre *r, const char **re)
322{
323 int op, esc, branch_start, last_op, fixup, cap_no, level;
324
325 fixup = 0;
326 level = r->num_caps;
327 branch_start = last_op = r->code_size;
328
329 for (;;)
330 switch (*(*re)++) {
331 case '\0':
332 (*re)--;
333 return;
334
335 break;
336 case '^':
337 emit(r, BOL);
338 break;
339 case '$':
340 emit(r, EOL);
341 break;
342 case '.':
343 last_op = r->code_size;
344 emit(r, ANY);
345 break;
346 case '[':
347 last_op = r->code_size;
348 anyof(r, re);
349 break;
350 case '\\':
351 last_op = r->code_size;
352 esc = get_escape_char(re);
353 if (esc & 0xff00)
354 emit(r, esc >> 8);
355 else
356 exact_one_char(r, esc);
357 break;
358 case '(':
359 last_op = r->code_size;
360 cap_no = ++r->num_caps;
361 emit(r, OPEN);
362 emit(r, cap_no);
363
364 compile(r, re);
365 if (*(*re)++ != ')') {
366 r->err_str = "No closing bracket";
367 return;
368 }
369
370 emit(r, CLOSE);
371 emit(r, cap_no);
372 break;
373 case ')':
374 (*re)--;
375 fixup_branch(r, fixup);
376 if (level == 0) {
377 r->err_str = "Unbalanced brackets";
378 return;
379 }
380 return;
381
382 break;
383 case '+':
384 case '*':
385 op = (*re)[-1] == '*' ? STAR : PLUS;
386 if (**re == '?') {
387 (*re)++;
388 op = op == STAR ? STARQ : PLUSQ;
389 }
390 quantifier(r, last_op, op);
391 break;
392 case '?':
393 quantifier(r, last_op, QUEST);
394 break;
395 case '|':
396 fixup_branch(r, fixup);
397 relocate(r, branch_start, 3);
398 r->code[branch_start] = BRANCH;
399 set_jump_offset(r, branch_start + 1, branch_start);
400 fixup = branch_start + 2;
401 r->code[fixup] = 0xff;
402 break;
403 default:
404 (*re)--;
405 last_op = r->code_size;
406 exact(r, re);
407 break;
408 }
409}
410
411int
412slre_compile(struct slre *r, const char *re)
413{
414 r->err_str = NULL;
415 r->code_size = r->data_size = r->num_caps = r->anchored = 0;
416
417 if (*re == '^')
418 r->anchored++;
419
420 emit(r, OPEN);
421 emit(r, 0);
422
423 while (*re != '\0')
424 compile(r, &re);
425
426 if (r->code[2] == BRANCH)
427 fixup_branch(r, 4);
428
429 emit(r, CLOSE);
430 emit(r, 0);
431 emit(r, END);
432
433 return (r->err_str == NULL ? 1 : 0);
434}
435
436static int match(const struct slre *, int,
437 const char *, int, int *, struct cap *);
438
439static void
440loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
441{
442 int saved_offset, matched_offset;
443
444 saved_offset = matched_offset = *ofs;
445
446 while (match(r, pc + 2, s, len, ofs, NULL)) {
447 saved_offset = *ofs;
448 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
449 matched_offset = saved_offset;
450 *ofs = saved_offset;
451 }
452
453 *ofs = matched_offset;
454}
455
456static void
457loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
458{
459 int saved_offset = *ofs;
460
461 while (match(r, pc + 2, s, len, ofs, NULL)) {
462 saved_offset = *ofs;
463 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
464 break;
465 }
466
467 *ofs = saved_offset;
468}
469
470static int
471is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
472{
473 int i, ch;
474
475 ch = s[*ofs];
476
477 for (i = 0; i < len; i++)
478 if (p[i] == ch) {
479 (*ofs)++;
480 return 1;
481 }
482
483 return 0;
484}
485
486static int
487is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
488{
489 int i, ch;
490
491 ch = s[*ofs];
492
493 for (i = 0; i < len; i++) {
494 if (p[i] == ch)
495 return 0;
496 }
497
498 (*ofs)++;
499 return 1;
500}
501
502static int
503match(const struct slre *r, int pc, const char *s, int len,
504 int *ofs, struct cap *caps)
505{
506 int n, saved_offset, res = 1;
507
508 while (res && r->code[pc] != END) {
509
510 assert(pc < r->code_size);
511 assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
512
513 switch (r->code[pc]) {
514 case BRANCH:
515 saved_offset = *ofs;
516 res = match(r, pc + 3, s, len, ofs, caps);
517 if (res == 0) {
518 *ofs = saved_offset;
519 res = match(r, pc + r->code[pc + 1],
520 s, len, ofs, caps);
521 }
522 pc += r->code[pc + 2];
523 break;
524 case EXACT:
525 res = 0;
526 n = r->code[pc + 2];
527 if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
528 r->code[pc + 1], n)) {
529 (*ofs) += n;
530 res = 1;
531 }
532 pc += 3;
533 break;
534 case QUEST:
535 res = 1;
536 saved_offset = *ofs;
537 if (!match(r, pc + 2, s, len, ofs, caps))
538 *ofs = saved_offset;
539 pc += r->code[pc + 1];
540 break;
541 case STAR:
542 res = 1;
543 loop_greedy(r, pc, s, len, ofs);
544 pc += r->code[pc + 1];
545 break;
546 case STARQ:
547 res = 1;
548 loop_non_greedy(r, pc, s, len, ofs);
549 pc += r->code[pc + 1];
550 break;
551 case PLUS:
552 res = match(r, pc + 2, s, len, ofs, caps);
553 if (res == 0)
554 break;
555
556 loop_greedy(r, pc, s, len, ofs);
557 pc += r->code[pc + 1];
558 break;
559 case PLUSQ:
560 res = match(r, pc + 2, s, len, ofs, caps);
561 if (res == 0)
562 break;
563
564 loop_non_greedy(r, pc, s, len, ofs);
565 pc += r->code[pc + 1];
566 break;
567 case SPACE:
568 res = 0;
569 if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
570 (*ofs)++;
571 res = 1;
572 }
573 pc++;
574 break;
575 case NONSPACE:
576 res = 0;
577 if (*ofs < len &&
578 !isspace(((unsigned char *)s)[*ofs])) {
579 (*ofs)++;
580 res = 1;
581 }
582 pc++;
583 break;
584 case DIGIT:
585 res = 0;
586 if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
587 (*ofs)++;
588 res = 1;
589 }
590 pc++;
591 break;
592 case ANY:
593 res = 0;
594 if (*ofs < len) {
595 (*ofs)++;
596 res = 1;
597 }
598 pc++;
599 break;
600 case ANYOF:
601 res = 0;
602 if (*ofs < len)
603 res = is_any_of(r->data + r->code[pc + 1],
604 r->code[pc + 2], s, ofs);
605 pc += 3;
606 break;
607 case ANYBUT:
608 res = 0;
609 if (*ofs < len)
610 res = is_any_but(r->data + r->code[pc + 1],
611 r->code[pc + 2], s, ofs);
612 pc += 3;
613 break;
614 case BOL:
615 res = *ofs == 0 ? 1 : 0;
616 pc++;
617 break;
618 case EOL:
619 res = *ofs == len ? 1 : 0;
620 pc++;
621 break;
622 case OPEN:
623 if (caps != NULL)
624 caps[r->code[pc + 1]].ptr = s + *ofs;
625 pc += 2;
626 break;
627 case CLOSE:
628 if (caps != NULL)
629 caps[r->code[pc + 1]].len = (s + *ofs) -
630 caps[r->code[pc + 1]].ptr;
631 pc += 2;
632 break;
633 case END:
634 pc++;
635 break;
636 default:
637 printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
638 assert(0);
639 break;
640 }
641 }
642
643 return res;
644}
645
646int
647slre_match(const struct slre *r, const char *buf, int len,
648 struct cap *caps)
649{
650 int i, ofs = 0, res = 0;
651
652 if (r->anchored) {
653 res = match(r, 0, buf, len, &ofs, caps);
654 } else {
655 for (i = 0; i < len && res == 0; i++) {
656 ofs = i;
657 res = match(r, 0, buf, len, &ofs, caps);
658 }
659 }
660
661 return res;
662}
663
664#ifdef SLRE_TEST
665#define N_CAPS 5
666
667int main(int argc, char *argv[])
668{
669 struct slre slre;
670 struct cap caps[N_CAPS];
671 unsigned char data[1 * 1024 * 1024];
672 FILE *fp;
673 int i, res, len;
674
675 if (argc < 2) {
676 fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
677 return 1;
678 }
679
680 fp = fopen(argv[2], "rb");
681 if (fp == NULL) {
682 fprintf(stderr, "Error: cannot open %s:%s\n",
683 argv[2], strerror(errno));
684 return 1;
685 }
686
687 if (!slre_compile(&slre, argv[1])) {
688 fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
689 return 1;
690 }
691
692 slre_dump(&slre, stderr);
693
694 while (fgets(data, sizeof(data), fp) != NULL) {
695 len = strlen(data);
696
697 if ((len > 0) && (data[len-1] == '\n')) {
698 data[len-1] = '\0';
699 --len;
700 }
701
702 printf("Data = \"%s\"\n", data);
703
704 (void) memset(caps, 0, sizeof(caps));
705
706 res = 0;
707
708 res = slre_match(&slre, data, len, caps);
709 printf("Result [%d]: %d\n", i, res);
710
711 for (i = 0; i < N_CAPS; i++) {
712 if (caps[i].len > 0) {
713 printf("Substring %d: len=%d [%.*s]\n", i,
714 caps[i].len,
715 caps[i].len, caps[i].ptr);
716 }
717 }
718 printf("----------------------------------------------------\n");
719 }
720 (void) fclose(fp);
721
722 return 0;
723}
724#endif
725