1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16#ifdef SLRE_TEST
17#include <stdio.h>
18#include <assert.h>
19#include <ctype.h>
20#include <stdlib.h>
21#include <string.h>
22#else
23#include <log.h>
24#include <common.h>
25#include <linux/ctype.h>
26#endif
27
28#include <errno.h>
29
30#include <slre.h>
31
32enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
33 STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
34
35#ifdef SLRE_TEST
36static struct {
37 const char *name;
38 int narg;
39 const char *flags;
40} opcodes[] = {
41 {"END", 0, ""},
42 {"BRANCH", 2, "oo"},
43 {"ANY", 0, ""},
44 {"EXACT", 2, "d"},
45 {"ANYOF", 2, "D"},
46 {"ANYBUT", 2, "D"},
47 {"OPEN ", 1, "i"},
48 {"CLOSE", 1, "i"},
49 {"BOL", 0, ""},
50 {"EOL", 0, ""},
51 {"STAR", 1, "o"},
52 {"PLUS", 1, "o"},
53 {"STARQ", 1, "o"},
54 {"PLUSQ", 1, "o"},
55 {"QUEST", 1, "o"},
56 {"SPACE", 0, ""},
57 {"NONSPACE", 0, ""},
58 {"DIGIT", 0, ""}
59};
60#endif
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93static const char *meta_chars = "|.^$*+?()[\\";
94
95#ifdef SLRE_TEST
96
97static void
98print_character_set(FILE *fp, const unsigned char *p, int len)
99{
100 int i;
101
102 for (i = 0; i < len; i++) {
103 if (i > 0)
104 (void) fputc(',', fp);
105 if (p[i] == 0) {
106 i++;
107 if (p[i] == 0)
108 (void) fprintf(fp, "\\x%02x", p[i]);
109 else
110 (void) fprintf(fp, "%s", opcodes[p[i]].name);
111 } else if (isprint(p[i])) {
112 (void) fputc(p[i], fp);
113 } else {
114 (void) fprintf(fp, "\\x%02x", p[i]);
115 }
116 }
117}
118
119void
120slre_dump(const struct slre *r, FILE *fp)
121{
122 int i, j, ch, op, pc;
123
124 for (pc = 0; pc < r->code_size; pc++) {
125
126 op = r->code[pc];
127 (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
128
129 for (i = 0; opcodes[op].flags[i] != '\0'; i++)
130 switch (opcodes[op].flags[i]) {
131 case 'i':
132 (void) fprintf(fp, "%d ", r->code[pc + 1]);
133 pc++;
134 break;
135 case 'o':
136 (void) fprintf(fp, "%d ",
137 pc + r->code[pc + 1] - i);
138 pc++;
139 break;
140 case 'D':
141 print_character_set(fp, r->data +
142 r->code[pc + 1], r->code[pc + 2]);
143 pc += 2;
144 break;
145 case 'd':
146 (void) fputc('"', fp);
147 for (j = 0; j < r->code[pc + 2]; j++) {
148 ch = r->data[r->code[pc + 1] + j];
149 if (isprint(ch)) {
150 (void) fputc(ch, fp);
151 } else {
152 (void) fprintf(fp,
153 "\\x%02x", ch);
154 }
155 }
156 (void) fputc('"', fp);
157 pc += 2;
158 break;
159 }
160
161 (void) fputc('\n', fp);
162 }
163}
164#endif
165
166static void
167set_jump_offset(struct slre *r, int pc, int offset)
168{
169 assert(offset < r->code_size);
170
171 if (r->code_size - offset > 0xff)
172 r->err_str = "Jump offset is too big";
173 else
174 r->code[pc] = (unsigned char) (r->code_size - offset);
175}
176
177static void
178emit(struct slre *r, int code)
179{
180 if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
181 r->err_str = "RE is too long (code overflow)";
182 else
183 r->code[r->code_size++] = (unsigned char) code;
184}
185
186static void
187store_char_in_data(struct slre *r, int ch)
188{
189 if (r->data_size >= (int) sizeof(r->data))
190 r->err_str = "RE is too long (data overflow)";
191 else
192 r->data[r->data_size++] = ch;
193}
194
195static void
196exact(struct slre *r, const char **re)
197{
198 int old_data_size = r->data_size;
199
200 while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
201 store_char_in_data(r, *(*re)++);
202
203 emit(r, EXACT);
204 emit(r, old_data_size);
205 emit(r, r->data_size - old_data_size);
206}
207
208static int
209get_escape_char(const char **re)
210{
211 int res;
212
213 switch (*(*re)++) {
214 case 'n':
215 res = '\n';
216 break;
217 case 'r':
218 res = '\r';
219 break;
220 case 't':
221 res = '\t';
222 break;
223 case '0':
224 res = 0;
225 break;
226 case 'S':
227 res = NONSPACE << 8;
228 break;
229 case 's':
230 res = SPACE << 8;
231 break;
232 case 'd':
233 res = DIGIT << 8;
234 break;
235 default:
236 res = (*re)[-1];
237 break;
238 }
239
240 return res;
241}
242
243static void
244anyof(struct slre *r, const char **re)
245{
246 int esc, old_data_size = r->data_size, op = ANYOF;
247
248 if (**re == '^') {
249 op = ANYBUT;
250 (*re)++;
251 }
252
253 while (**re != '\0')
254
255 switch (*(*re)++) {
256 case ']':
257 emit(r, op);
258 emit(r, old_data_size);
259 emit(r, r->data_size - old_data_size);
260 return;
261
262 break;
263 case '\\':
264 esc = get_escape_char(re);
265 if ((esc & 0xff) == 0) {
266 store_char_in_data(r, 0);
267 store_char_in_data(r, esc >> 8);
268 } else {
269 store_char_in_data(r, esc);
270 }
271 break;
272 default:
273 store_char_in_data(r, (*re)[-1]);
274 break;
275 }
276
277 r->err_str = "No closing ']' bracket";
278}
279
280static void
281relocate(struct slre *r, int begin, int shift)
282{
283 emit(r, END);
284 memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
285 r->code_size += shift;
286}
287
288static void
289quantifier(struct slre *r, int prev, int op)
290{
291 if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
292 r->code[prev + 2]--;
293 emit(r, EXACT);
294 emit(r, r->code[prev + 1] + r->code[prev + 2]);
295 emit(r, 1);
296 prev = r->code_size - 3;
297 }
298 relocate(r, prev, 2);
299 r->code[prev] = op;
300 set_jump_offset(r, prev + 1, prev);
301}
302
303static void
304exact_one_char(struct slre *r, int ch)
305{
306 emit(r, EXACT);
307 emit(r, r->data_size);
308 emit(r, 1);
309 store_char_in_data(r, ch);
310}
311
312static void
313fixup_branch(struct slre *r, int fixup)
314{
315 if (fixup > 0) {
316 emit(r, END);
317 set_jump_offset(r, fixup, fixup - 2);
318 }
319}
320
321static void
322compile(struct slre *r, const char **re)
323{
324 int op, esc, branch_start, last_op, fixup, cap_no, level;
325
326 fixup = 0;
327 level = r->num_caps;
328 branch_start = last_op = r->code_size;
329
330 for (;;)
331 switch (*(*re)++) {
332 case '\0':
333 (*re)--;
334 return;
335
336 break;
337 case '^':
338 emit(r, BOL);
339 break;
340 case '$':
341 emit(r, EOL);
342 break;
343 case '.':
344 last_op = r->code_size;
345 emit(r, ANY);
346 break;
347 case '[':
348 last_op = r->code_size;
349 anyof(r, re);
350 break;
351 case '\\':
352 last_op = r->code_size;
353 esc = get_escape_char(re);
354 if (esc & 0xff00)
355 emit(r, esc >> 8);
356 else
357 exact_one_char(r, esc);
358 break;
359 case '(':
360 last_op = r->code_size;
361 cap_no = ++r->num_caps;
362 emit(r, OPEN);
363 emit(r, cap_no);
364
365 compile(r, re);
366 if (*(*re)++ != ')') {
367 r->err_str = "No closing bracket";
368 return;
369 }
370
371 emit(r, CLOSE);
372 emit(r, cap_no);
373 break;
374 case ')':
375 (*re)--;
376 fixup_branch(r, fixup);
377 if (level == 0) {
378 r->err_str = "Unbalanced brackets";
379 return;
380 }
381 return;
382
383 break;
384 case '+':
385 case '*':
386 op = (*re)[-1] == '*' ? STAR : PLUS;
387 if (**re == '?') {
388 (*re)++;
389 op = op == STAR ? STARQ : PLUSQ;
390 }
391 quantifier(r, last_op, op);
392 break;
393 case '?':
394 quantifier(r, last_op, QUEST);
395 break;
396 case '|':
397 fixup_branch(r, fixup);
398 relocate(r, branch_start, 3);
399 r->code[branch_start] = BRANCH;
400 set_jump_offset(r, branch_start + 1, branch_start);
401 fixup = branch_start + 2;
402 r->code[fixup] = 0xff;
403 break;
404 default:
405 (*re)--;
406 last_op = r->code_size;
407 exact(r, re);
408 break;
409 }
410}
411
412int
413slre_compile(struct slre *r, const char *re)
414{
415 r->err_str = NULL;
416 r->code_size = r->data_size = r->num_caps = r->anchored = 0;
417
418 if (*re == '^')
419 r->anchored++;
420
421 emit(r, OPEN);
422 emit(r, 0);
423
424 while (*re != '\0')
425 compile(r, &re);
426
427 if (r->code[2] == BRANCH)
428 fixup_branch(r, 4);
429
430 emit(r, CLOSE);
431 emit(r, 0);
432 emit(r, END);
433
434 return (r->err_str == NULL ? 1 : 0);
435}
436
437static int match(const struct slre *, int,
438 const char *, int, int *, struct cap *);
439
440static void
441loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
442{
443 int saved_offset, matched_offset;
444
445 matched_offset = *ofs;
446
447 while (match(r, pc + 2, s, len, ofs, NULL)) {
448 saved_offset = *ofs;
449 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
450 matched_offset = saved_offset;
451 *ofs = saved_offset;
452 }
453
454 *ofs = matched_offset;
455}
456
457static void
458loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
459{
460 int saved_offset = *ofs;
461
462 while (match(r, pc + 2, s, len, ofs, NULL)) {
463 saved_offset = *ofs;
464 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
465 break;
466 }
467
468 *ofs = saved_offset;
469}
470
471static int
472is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
473{
474 int i, ch;
475
476 ch = s[*ofs];
477
478 for (i = 0; i < len; i++)
479 if (p[i] == ch) {
480 (*ofs)++;
481 return 1;
482 }
483
484 return 0;
485}
486
487static int
488is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
489{
490 int i, ch;
491
492 ch = s[*ofs];
493
494 for (i = 0; i < len; i++) {
495 if (p[i] == ch)
496 return 0;
497 }
498
499 (*ofs)++;
500 return 1;
501}
502
503static int
504match(const struct slre *r, int pc, const char *s, int len,
505 int *ofs, struct cap *caps)
506{
507 int n, saved_offset, res = 1;
508
509 while (res && r->code[pc] != END) {
510
511 assert(pc < r->code_size);
512 assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
513
514 switch (r->code[pc]) {
515 case BRANCH:
516 saved_offset = *ofs;
517 res = match(r, pc + 3, s, len, ofs, caps);
518 if (res == 0) {
519 *ofs = saved_offset;
520 res = match(r, pc + r->code[pc + 1],
521 s, len, ofs, caps);
522 }
523 pc += r->code[pc + 2];
524 break;
525 case EXACT:
526 res = 0;
527 n = r->code[pc + 2];
528 if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
529 r->code[pc + 1], n)) {
530 (*ofs) += n;
531 res = 1;
532 }
533 pc += 3;
534 break;
535 case QUEST:
536 res = 1;
537 saved_offset = *ofs;
538 if (!match(r, pc + 2, s, len, ofs, caps))
539 *ofs = saved_offset;
540 pc += r->code[pc + 1];
541 break;
542 case STAR:
543 res = 1;
544 loop_greedy(r, pc, s, len, ofs);
545 pc += r->code[pc + 1];
546 break;
547 case STARQ:
548 res = 1;
549 loop_non_greedy(r, pc, s, len, ofs);
550 pc += r->code[pc + 1];
551 break;
552 case PLUS:
553 res = match(r, pc + 2, s, len, ofs, caps);
554 if (res == 0)
555 break;
556
557 loop_greedy(r, pc, s, len, ofs);
558 pc += r->code[pc + 1];
559 break;
560 case PLUSQ:
561 res = match(r, pc + 2, s, len, ofs, caps);
562 if (res == 0)
563 break;
564
565 loop_non_greedy(r, pc, s, len, ofs);
566 pc += r->code[pc + 1];
567 break;
568 case SPACE:
569 res = 0;
570 if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
571 (*ofs)++;
572 res = 1;
573 }
574 pc++;
575 break;
576 case NONSPACE:
577 res = 0;
578 if (*ofs < len &&
579 !isspace(((unsigned char *)s)[*ofs])) {
580 (*ofs)++;
581 res = 1;
582 }
583 pc++;
584 break;
585 case DIGIT:
586 res = 0;
587 if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
588 (*ofs)++;
589 res = 1;
590 }
591 pc++;
592 break;
593 case ANY:
594 res = 0;
595 if (*ofs < len) {
596 (*ofs)++;
597 res = 1;
598 }
599 pc++;
600 break;
601 case ANYOF:
602 res = 0;
603 if (*ofs < len)
604 res = is_any_of(r->data + r->code[pc + 1],
605 r->code[pc + 2], s, ofs);
606 pc += 3;
607 break;
608 case ANYBUT:
609 res = 0;
610 if (*ofs < len)
611 res = is_any_but(r->data + r->code[pc + 1],
612 r->code[pc + 2], s, ofs);
613 pc += 3;
614 break;
615 case BOL:
616 res = *ofs == 0 ? 1 : 0;
617 pc++;
618 break;
619 case EOL:
620 res = *ofs == len ? 1 : 0;
621 pc++;
622 break;
623 case OPEN:
624 if (caps != NULL)
625 caps[r->code[pc + 1]].ptr = s + *ofs;
626 pc += 2;
627 break;
628 case CLOSE:
629 if (caps != NULL)
630 caps[r->code[pc + 1]].len = (s + *ofs) -
631 caps[r->code[pc + 1]].ptr;
632 pc += 2;
633 break;
634 case END:
635 pc++;
636 break;
637 default:
638 printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
639 assert(0);
640 break;
641 }
642 }
643
644 return res;
645}
646
647int
648slre_match(const struct slre *r, const char *buf, int len,
649 struct cap *caps)
650{
651 int i, ofs = 0, res = 0;
652
653 if (r->anchored) {
654 res = match(r, 0, buf, len, &ofs, caps);
655 } else {
656 for (i = 0; i < len && res == 0; i++) {
657 ofs = i;
658 res = match(r, 0, buf, len, &ofs, caps);
659 }
660 }
661
662 return res;
663}
664
665#ifdef SLRE_TEST
666#define N_CAPS 5
667
668int main(int argc, char *argv[])
669{
670 struct slre slre;
671 struct cap caps[N_CAPS];
672 unsigned char data[1 * 1024 * 1024];
673 FILE *fp;
674 int i, res, len;
675
676 if (argc < 2) {
677 fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
678 return 1;
679 }
680
681 fp = fopen(argv[2], "rb");
682 if (fp == NULL) {
683 fprintf(stderr, "Error: cannot open %s:%s\n",
684 argv[2], strerror(errno));
685 return 1;
686 }
687
688 if (!slre_compile(&slre, argv[1])) {
689 fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
690 return 1;
691 }
692
693 slre_dump(&slre, stderr);
694
695 while (fgets(data, sizeof(data), fp) != NULL) {
696 len = strlen(data);
697
698 if ((len > 0) && (data[len-1] == '\n')) {
699 data[len-1] = '\0';
700 --len;
701 }
702
703 printf("Data = \"%s\"\n", data);
704
705 (void) memset(caps, 0, sizeof(caps));
706
707 res = slre_match(&slre, data, len, caps);
708 printf("Result [%d]: %d\n", i, res);
709
710 for (i = 0; i < N_CAPS; i++) {
711 if (caps[i].len > 0) {
712 printf("Substring %d: len=%d [%.*s]\n", i,
713 caps[i].len,
714 caps[i].len, caps[i].ptr);
715 }
716 }
717 printf("----------------------------------------------------\n");
718 }
719 (void) fclose(fp);
720
721 return 0;
722}
723#endif
724