uboot/lib/slre.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
   3 * All rights reserved
   4 *
   5 * "THE BEER-WARE LICENSE" (Revision 42):
   6 * Sergey Lyubka wrote this file.  As long as you retain this notice you
   7 * can do whatever you want with this stuff. If we meet some day, and you think
   8 * this stuff is worth it, you can buy me a beer in return.
   9 */
  10
  11/*
  12 * Downloaded Sat Nov  5 17:43:06 CET 2011 at
  13 * http://slre.sourceforge.net/1.0/slre.c
  14 */
  15
  16#ifdef SLRE_TEST
  17#include <stdio.h>
  18#include <assert.h>
  19#include <ctype.h>
  20#include <stdlib.h>
  21#include <string.h>
  22#else
  23#include <log.h>
  24#include <common.h>
  25#include <linux/ctype.h>
  26#endif /* SLRE_TEST */
  27
  28#include <errno.h>
  29
  30#include <slre.h>
  31
  32enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
  33        STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
  34
  35#ifdef SLRE_TEST
  36static struct {
  37        const char      *name;
  38        int             narg;
  39        const char      *flags;
  40} opcodes[] = {
  41        {"END",         0, ""},         /* End of code block or program */
  42        {"BRANCH",      2, "oo"},       /* Alternative operator, "|"    */
  43        {"ANY",         0, ""},         /* Match any character, "."     */
  44        {"EXACT",       2, "d"},        /* Match exact string           */
  45        {"ANYOF",       2, "D"},        /* Match any from set, "[]"     */
  46        {"ANYBUT",      2, "D"},        /* Match any but from set, "[^]"*/
  47        {"OPEN ",       1, "i"},        /* Capture start, "("           */
  48        {"CLOSE",       1, "i"},        /* Capture end, ")"             */
  49        {"BOL",         0, ""},         /* Beginning of string, "^"     */
  50        {"EOL",         0, ""},         /* End of string, "$"           */
  51        {"STAR",        1, "o"},        /* Match zero or more times "*" */
  52        {"PLUS",        1, "o"},        /* Match one or more times, "+" */
  53        {"STARQ",       1, "o"},        /* Non-greedy STAR,  "*?"       */
  54        {"PLUSQ",       1, "o"},        /* Non-greedy PLUS, "+?"        */
  55        {"QUEST",       1, "o"},        /* Match zero or one time, "?"  */
  56        {"SPACE",       0, ""},         /* Match whitespace, "\s"       */
  57        {"NONSPACE",    0, ""},         /* Match non-space, "\S"        */
  58        {"DIGIT",       0, ""}          /* Match digit, "\d"            */
  59};
  60#endif /* SLRE_TEST */
  61
  62/*
  63 * Commands and operands are all unsigned char (1 byte long). All code offsets
  64 * are relative to current address, and positive (always point forward). Data
  65 * offsets are absolute. Commands with operands:
  66 *
  67 * BRANCH offset1 offset2
  68 *      Try to match the code block that follows the BRANCH instruction
  69 *      (code block ends with END). If no match, try to match code block that
  70 *      starts at offset1. If either of these match, jump to offset2.
  71 *
  72 * EXACT data_offset data_length
  73 *      Try to match exact string. String is recorded in data section from
  74 *      data_offset, and has length data_length.
  75 *
  76 * OPEN capture_number
  77 * CLOSE capture_number
  78 *      If the user have passed 'struct cap' array for captures, OPEN
  79 *      records the beginning of the matched substring (cap->ptr), CLOSE
  80 *      sets the length (cap->len) for respective capture_number.
  81 *
  82 * STAR code_offset
  83 * PLUS code_offset
  84 * QUEST code_offset
  85 *      *, +, ?, respectively. Try to gobble as much as possible from the
  86 *      matched buffer, until code block that follows these instructions
  87 *      matches. When the longest possible string is matched,
  88 *      jump to code_offset
  89 *
  90 * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
  91 */
  92
  93static const char *meta_chars = "|.^$*+?()[\\";
  94
  95#ifdef SLRE_TEST
  96
  97static void
  98print_character_set(FILE *fp, const unsigned char *p, int len)
  99{
 100        int     i;
 101
 102        for (i = 0; i < len; i++) {
 103                if (i > 0)
 104                        (void) fputc(',', fp);
 105                if (p[i] == 0) {
 106                        i++;
 107                        if (p[i] == 0)
 108                                (void) fprintf(fp, "\\x%02x", p[i]);
 109                        else
 110                                (void) fprintf(fp, "%s", opcodes[p[i]].name);
 111                } else if (isprint(p[i])) {
 112                        (void) fputc(p[i], fp);
 113                } else {
 114                        (void) fprintf(fp, "\\x%02x", p[i]);
 115                }
 116        }
 117}
 118
 119void
 120slre_dump(const struct slre *r, FILE *fp)
 121{
 122        int     i, j, ch, op, pc;
 123
 124        for (pc = 0; pc < r->code_size; pc++) {
 125
 126                op = r->code[pc];
 127                (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
 128
 129                for (i = 0; opcodes[op].flags[i] != '\0'; i++)
 130                        switch (opcodes[op].flags[i]) {
 131                        case 'i':
 132                                (void) fprintf(fp, "%d ", r->code[pc + 1]);
 133                                pc++;
 134                                break;
 135                        case 'o':
 136                                (void) fprintf(fp, "%d ",
 137                                    pc + r->code[pc + 1] - i);
 138                                pc++;
 139                                break;
 140                        case 'D':
 141                                print_character_set(fp, r->data +
 142                                    r->code[pc + 1], r->code[pc + 2]);
 143                                pc += 2;
 144                                break;
 145                        case 'd':
 146                                (void) fputc('"', fp);
 147                                for (j = 0; j < r->code[pc + 2]; j++) {
 148                                        ch = r->data[r->code[pc + 1] + j];
 149                                        if (isprint(ch)) {
 150                                                (void) fputc(ch, fp);
 151                                        } else {
 152                                                (void) fprintf(fp,
 153                                                        "\\x%02x", ch);
 154                                        }
 155                                }
 156                                (void) fputc('"', fp);
 157                                pc += 2;
 158                                break;
 159                        }
 160
 161                (void) fputc('\n', fp);
 162        }
 163}
 164#endif /* SLRE_TEST */
 165
 166static void
 167set_jump_offset(struct slre *r, int pc, int offset)
 168{
 169        assert(offset < r->code_size);
 170
 171        if (r->code_size - offset > 0xff)
 172                r->err_str = "Jump offset is too big";
 173        else
 174                r->code[pc] = (unsigned char) (r->code_size - offset);
 175}
 176
 177static void
 178emit(struct slre *r, int code)
 179{
 180        if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
 181                r->err_str = "RE is too long (code overflow)";
 182        else
 183                r->code[r->code_size++] = (unsigned char) code;
 184}
 185
 186static void
 187store_char_in_data(struct slre *r, int ch)
 188{
 189        if (r->data_size >= (int) sizeof(r->data))
 190                r->err_str = "RE is too long (data overflow)";
 191        else
 192                r->data[r->data_size++] = ch;
 193}
 194
 195static void
 196exact(struct slre *r, const char **re)
 197{
 198        int     old_data_size = r->data_size;
 199
 200        while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
 201                store_char_in_data(r, *(*re)++);
 202
 203        emit(r, EXACT);
 204        emit(r, old_data_size);
 205        emit(r, r->data_size - old_data_size);
 206}
 207
 208static int
 209get_escape_char(const char **re)
 210{
 211        int     res;
 212
 213        switch (*(*re)++) {
 214        case 'n':
 215                res = '\n';
 216                break;
 217        case 'r':
 218                res = '\r';
 219                break;
 220        case 't':
 221                res = '\t';
 222                break;
 223        case '0':
 224                res = 0;
 225                break;
 226        case 'S':
 227                res = NONSPACE << 8;
 228                break;
 229        case 's':
 230                res = SPACE << 8;
 231                break;
 232        case 'd':
 233                res = DIGIT << 8;
 234                break;
 235        default:
 236                res = (*re)[-1];
 237                break;
 238        }
 239
 240        return res;
 241}
 242
 243static void
 244anyof(struct slre *r, const char **re)
 245{
 246        int     esc, old_data_size = r->data_size, op = ANYOF;
 247
 248        if (**re == '^') {
 249                op = ANYBUT;
 250                (*re)++;
 251        }
 252
 253        while (**re != '\0')
 254
 255                switch (*(*re)++) {
 256                case ']':
 257                        emit(r, op);
 258                        emit(r, old_data_size);
 259                        emit(r, r->data_size - old_data_size);
 260                        return;
 261                        /* NOTREACHED */
 262                        break;
 263                case '\\':
 264                        esc = get_escape_char(re);
 265                        if ((esc & 0xff) == 0) {
 266                                store_char_in_data(r, 0);
 267                                store_char_in_data(r, esc >> 8);
 268                        } else {
 269                                store_char_in_data(r, esc);
 270                        }
 271                        break;
 272                default:
 273                        store_char_in_data(r, (*re)[-1]);
 274                        break;
 275                }
 276
 277        r->err_str = "No closing ']' bracket";
 278}
 279
 280static void
 281relocate(struct slre *r, int begin, int shift)
 282{
 283        emit(r, END);
 284        memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
 285        r->code_size += shift;
 286}
 287
 288static void
 289quantifier(struct slre *r, int prev, int op)
 290{
 291        if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
 292                r->code[prev + 2]--;
 293                emit(r, EXACT);
 294                emit(r, r->code[prev + 1] + r->code[prev + 2]);
 295                emit(r, 1);
 296                prev = r->code_size - 3;
 297        }
 298        relocate(r, prev, 2);
 299        r->code[prev] = op;
 300        set_jump_offset(r, prev + 1, prev);
 301}
 302
 303static void
 304exact_one_char(struct slre *r, int ch)
 305{
 306        emit(r, EXACT);
 307        emit(r, r->data_size);
 308        emit(r, 1);
 309        store_char_in_data(r, ch);
 310}
 311
 312static void
 313fixup_branch(struct slre *r, int fixup)
 314{
 315        if (fixup > 0) {
 316                emit(r, END);
 317                set_jump_offset(r, fixup, fixup - 2);
 318        }
 319}
 320
 321static void
 322compile(struct slre *r, const char **re)
 323{
 324        int     op, esc, branch_start, last_op, fixup, cap_no, level;
 325
 326        fixup = 0;
 327        level = r->num_caps;
 328        branch_start = last_op = r->code_size;
 329
 330        for (;;)
 331                switch (*(*re)++) {
 332                case '\0':
 333                        (*re)--;
 334                        return;
 335                        /* NOTREACHED */
 336                        break;
 337                case '^':
 338                        emit(r, BOL);
 339                        break;
 340                case '$':
 341                        emit(r, EOL);
 342                        break;
 343                case '.':
 344                        last_op = r->code_size;
 345                        emit(r, ANY);
 346                        break;
 347                case '[':
 348                        last_op = r->code_size;
 349                        anyof(r, re);
 350                        break;
 351                case '\\':
 352                        last_op = r->code_size;
 353                        esc = get_escape_char(re);
 354                        if (esc & 0xff00)
 355                                emit(r, esc >> 8);
 356                        else
 357                                exact_one_char(r, esc);
 358                        break;
 359                case '(':
 360                        last_op = r->code_size;
 361                        cap_no = ++r->num_caps;
 362                        emit(r, OPEN);
 363                        emit(r, cap_no);
 364
 365                        compile(r, re);
 366                        if (*(*re)++ != ')') {
 367                                r->err_str = "No closing bracket";
 368                                return;
 369                        }
 370
 371                        emit(r, CLOSE);
 372                        emit(r, cap_no);
 373                        break;
 374                case ')':
 375                        (*re)--;
 376                        fixup_branch(r, fixup);
 377                        if (level == 0) {
 378                                r->err_str = "Unbalanced brackets";
 379                                return;
 380                        }
 381                        return;
 382                        /* NOTREACHED */
 383                        break;
 384                case '+':
 385                case '*':
 386                        op = (*re)[-1] == '*' ? STAR : PLUS;
 387                        if (**re == '?') {
 388                                (*re)++;
 389                                op = op == STAR ? STARQ : PLUSQ;
 390                        }
 391                        quantifier(r, last_op, op);
 392                        break;
 393                case '?':
 394                        quantifier(r, last_op, QUEST);
 395                        break;
 396                case '|':
 397                        fixup_branch(r, fixup);
 398                        relocate(r, branch_start, 3);
 399                        r->code[branch_start] = BRANCH;
 400                        set_jump_offset(r, branch_start + 1, branch_start);
 401                        fixup = branch_start + 2;
 402                        r->code[fixup] = 0xff;
 403                        break;
 404                default:
 405                        (*re)--;
 406                        last_op = r->code_size;
 407                        exact(r, re);
 408                        break;
 409                }
 410}
 411
 412int
 413slre_compile(struct slre *r, const char *re)
 414{
 415        r->err_str = NULL;
 416        r->code_size = r->data_size = r->num_caps = r->anchored = 0;
 417
 418        if (*re == '^')
 419                r->anchored++;
 420
 421        emit(r, OPEN);  /* This will capture what matches full RE */
 422        emit(r, 0);
 423
 424        while (*re != '\0')
 425                compile(r, &re);
 426
 427        if (r->code[2] == BRANCH)
 428                fixup_branch(r, 4);
 429
 430        emit(r, CLOSE);
 431        emit(r, 0);
 432        emit(r, END);
 433
 434        return (r->err_str == NULL ? 1 : 0);
 435}
 436
 437static int match(const struct slre *, int,
 438                const char *, int, int *, struct cap *);
 439
 440static void
 441loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
 442{
 443        int     saved_offset, matched_offset;
 444
 445        matched_offset = *ofs;
 446
 447        while (match(r, pc + 2, s, len, ofs, NULL)) {
 448                saved_offset = *ofs;
 449                if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
 450                        matched_offset = saved_offset;
 451                *ofs = saved_offset;
 452        }
 453
 454        *ofs = matched_offset;
 455}
 456
 457static void
 458loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
 459{
 460        int     saved_offset = *ofs;
 461
 462        while (match(r, pc + 2, s, len, ofs, NULL)) {
 463                saved_offset = *ofs;
 464                if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
 465                        break;
 466        }
 467
 468        *ofs = saved_offset;
 469}
 470
 471static int
 472is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
 473{
 474        int     i, ch;
 475
 476        ch = s[*ofs];
 477
 478        for (i = 0; i < len; i++)
 479                if (p[i] == ch) {
 480                        (*ofs)++;
 481                        return 1;
 482                }
 483
 484        return 0;
 485}
 486
 487static int
 488is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
 489{
 490        int     i, ch;
 491
 492        ch = s[*ofs];
 493
 494        for (i = 0; i < len; i++) {
 495                if (p[i] == ch)
 496                        return 0;
 497        }
 498
 499        (*ofs)++;
 500        return 1;
 501}
 502
 503static int
 504match(const struct slre *r, int pc, const char *s, int len,
 505                int *ofs, struct cap *caps)
 506{
 507        int     n, saved_offset, res = 1;
 508
 509        while (res && r->code[pc] != END) {
 510
 511                assert(pc < r->code_size);
 512                assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
 513
 514                switch (r->code[pc]) {
 515                case BRANCH:
 516                        saved_offset = *ofs;
 517                        res = match(r, pc + 3, s, len, ofs, caps);
 518                        if (res == 0) {
 519                                *ofs = saved_offset;
 520                                res = match(r, pc + r->code[pc + 1],
 521                                    s, len, ofs, caps);
 522                        }
 523                        pc += r->code[pc + 2];
 524                        break;
 525                case EXACT:
 526                        res = 0;
 527                        n = r->code[pc + 2];    /* String length */
 528                        if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
 529                            r->code[pc + 1], n)) {
 530                                (*ofs) += n;
 531                                res = 1;
 532                        }
 533                        pc += 3;
 534                        break;
 535                case QUEST:
 536                        res = 1;
 537                        saved_offset = *ofs;
 538                        if (!match(r, pc + 2, s, len, ofs, caps))
 539                                *ofs = saved_offset;
 540                        pc += r->code[pc + 1];
 541                        break;
 542                case STAR:
 543                        res = 1;
 544                        loop_greedy(r, pc, s, len, ofs);
 545                        pc += r->code[pc + 1];
 546                        break;
 547                case STARQ:
 548                        res = 1;
 549                        loop_non_greedy(r, pc, s, len, ofs);
 550                        pc += r->code[pc + 1];
 551                        break;
 552                case PLUS:
 553                        res = match(r, pc + 2, s, len, ofs, caps);
 554                        if (res == 0)
 555                                break;
 556
 557                        loop_greedy(r, pc, s, len, ofs);
 558                        pc += r->code[pc + 1];
 559                        break;
 560                case PLUSQ:
 561                        res = match(r, pc + 2, s, len, ofs, caps);
 562                        if (res == 0)
 563                                break;
 564
 565                        loop_non_greedy(r, pc, s, len, ofs);
 566                        pc += r->code[pc + 1];
 567                        break;
 568                case SPACE:
 569                        res = 0;
 570                        if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
 571                                (*ofs)++;
 572                                res = 1;
 573                        }
 574                        pc++;
 575                        break;
 576                case NONSPACE:
 577                        res = 0;
 578                        if (*ofs < len &&
 579                                        !isspace(((unsigned char *)s)[*ofs])) {
 580                                (*ofs)++;
 581                                res = 1;
 582                        }
 583                        pc++;
 584                        break;
 585                case DIGIT:
 586                        res = 0;
 587                        if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
 588                                (*ofs)++;
 589                                res = 1;
 590                        }
 591                        pc++;
 592                        break;
 593                case ANY:
 594                        res = 0;
 595                        if (*ofs < len) {
 596                                (*ofs)++;
 597                                res = 1;
 598                        }
 599                        pc++;
 600                        break;
 601                case ANYOF:
 602                        res = 0;
 603                        if (*ofs < len)
 604                                res = is_any_of(r->data + r->code[pc + 1],
 605                                        r->code[pc + 2], s, ofs);
 606                        pc += 3;
 607                        break;
 608                case ANYBUT:
 609                        res = 0;
 610                        if (*ofs < len)
 611                                res = is_any_but(r->data + r->code[pc + 1],
 612                                        r->code[pc + 2], s, ofs);
 613                        pc += 3;
 614                        break;
 615                case BOL:
 616                        res = *ofs == 0 ? 1 : 0;
 617                        pc++;
 618                        break;
 619                case EOL:
 620                        res = *ofs == len ? 1 : 0;
 621                        pc++;
 622                        break;
 623                case OPEN:
 624                        if (caps != NULL)
 625                                caps[r->code[pc + 1]].ptr = s + *ofs;
 626                        pc += 2;
 627                        break;
 628                case CLOSE:
 629                        if (caps != NULL)
 630                                caps[r->code[pc + 1]].len = (s + *ofs) -
 631                                    caps[r->code[pc + 1]].ptr;
 632                        pc += 2;
 633                        break;
 634                case END:
 635                        pc++;
 636                        break;
 637                default:
 638                        printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
 639                        assert(0);
 640                        break;
 641                }
 642        }
 643
 644        return res;
 645}
 646
 647int
 648slre_match(const struct slre *r, const char *buf, int len,
 649                struct cap *caps)
 650{
 651        int     i, ofs = 0, res = 0;
 652
 653        if (r->anchored) {
 654                res = match(r, 0, buf, len, &ofs, caps);
 655        } else {
 656                for (i = 0; i < len && res == 0; i++) {
 657                        ofs = i;
 658                        res = match(r, 0, buf, len, &ofs, caps);
 659                }
 660        }
 661
 662        return res;
 663}
 664
 665#ifdef SLRE_TEST
 666#define N_CAPS  5
 667
 668int main(int argc, char *argv[])
 669{
 670        struct slre     slre;
 671        struct cap      caps[N_CAPS];
 672        unsigned char   data[1 * 1024 * 1024];
 673        FILE            *fp;
 674        int             i, res, len;
 675
 676        if (argc < 2) {
 677                fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
 678                return 1;
 679        }
 680
 681        fp = fopen(argv[2], "rb");
 682        if (fp == NULL) {
 683                fprintf(stderr, "Error: cannot open %s:%s\n",
 684                        argv[2], strerror(errno));
 685                return 1;
 686        }
 687
 688        if (!slre_compile(&slre, argv[1])) {
 689                fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
 690                return 1;
 691        }
 692
 693        slre_dump(&slre, stderr);
 694
 695        while (fgets(data, sizeof(data), fp) != NULL) {
 696                len = strlen(data);
 697
 698                if ((len > 0) && (data[len-1] == '\n')) {
 699                        data[len-1] = '\0';
 700                        --len;
 701                }
 702
 703                printf("Data = \"%s\"\n", data);
 704
 705                (void) memset(caps, 0, sizeof(caps));
 706
 707                res = slre_match(&slre, data, len, caps);
 708                printf("Result [%d]: %d\n", i, res);
 709
 710                for (i = 0; i < N_CAPS; i++) {
 711                        if (caps[i].len > 0) {
 712                                printf("Substring %d: len=%d  [%.*s]\n", i,
 713                                        caps[i].len,
 714                                        caps[i].len, caps[i].ptr);
 715                        }
 716                }
 717                printf("----------------------------------------------------\n");
 718        }
 719        (void) fclose(fp);
 720
 721        return 0;
 722}
 723#endif /* SLRE_TEST */
 724