uboot/lib/slre.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
   3 * All rights reserved
   4 *
   5 * "THE BEER-WARE LICENSE" (Revision 42):
   6 * Sergey Lyubka wrote this file.  As long as you retain this notice you
   7 * can do whatever you want with this stuff. If we meet some day, and you think
   8 * this stuff is worth it, you can buy me a beer in return.
   9 */
  10
  11/*
  12 * Downloaded Sat Nov  5 17:43:06 CET 2011 at
  13 * http://slre.sourceforge.net/1.0/slre.c
  14 */
  15
  16#ifdef SLRE_TEST
  17#include <stdio.h>
  18#include <assert.h>
  19#include <ctype.h>
  20#include <stdlib.h>
  21#include <string.h>
  22#else
  23#include <common.h>
  24#include <linux/ctype.h>
  25#endif /* SLRE_TEST */
  26
  27#include <errno.h>
  28
  29#include <slre.h>
  30
  31enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
  32        STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
  33
  34#ifdef SLRE_TEST
  35static struct {
  36        const char      *name;
  37        int             narg;
  38        const char      *flags;
  39} opcodes[] = {
  40        {"END",         0, ""},         /* End of code block or program */
  41        {"BRANCH",      2, "oo"},       /* Alternative operator, "|"    */
  42        {"ANY",         0, ""},         /* Match any character, "."     */
  43        {"EXACT",       2, "d"},        /* Match exact string           */
  44        {"ANYOF",       2, "D"},        /* Match any from set, "[]"     */
  45        {"ANYBUT",      2, "D"},        /* Match any but from set, "[^]"*/
  46        {"OPEN ",       1, "i"},        /* Capture start, "("           */
  47        {"CLOSE",       1, "i"},        /* Capture end, ")"             */
  48        {"BOL",         0, ""},         /* Beginning of string, "^"     */
  49        {"EOL",         0, ""},         /* End of string, "$"           */
  50        {"STAR",        1, "o"},        /* Match zero or more times "*" */
  51        {"PLUS",        1, "o"},        /* Match one or more times, "+" */
  52        {"STARQ",       1, "o"},        /* Non-greedy STAR,  "*?"       */
  53        {"PLUSQ",       1, "o"},        /* Non-greedy PLUS, "+?"        */
  54        {"QUEST",       1, "o"},        /* Match zero or one time, "?"  */
  55        {"SPACE",       0, ""},         /* Match whitespace, "\s"       */
  56        {"NONSPACE",    0, ""},         /* Match non-space, "\S"        */
  57        {"DIGIT",       0, ""}          /* Match digit, "\d"            */
  58};
  59#endif /* SLRE_TEST */
  60
  61/*
  62 * Commands and operands are all unsigned char (1 byte long). All code offsets
  63 * are relative to current address, and positive (always point forward). Data
  64 * offsets are absolute. Commands with operands:
  65 *
  66 * BRANCH offset1 offset2
  67 *      Try to match the code block that follows the BRANCH instruction
  68 *      (code block ends with END). If no match, try to match code block that
  69 *      starts at offset1. If either of these match, jump to offset2.
  70 *
  71 * EXACT data_offset data_length
  72 *      Try to match exact string. String is recorded in data section from
  73 *      data_offset, and has length data_length.
  74 *
  75 * OPEN capture_number
  76 * CLOSE capture_number
  77 *      If the user have passed 'struct cap' array for captures, OPEN
  78 *      records the beginning of the matched substring (cap->ptr), CLOSE
  79 *      sets the length (cap->len) for respective capture_number.
  80 *
  81 * STAR code_offset
  82 * PLUS code_offset
  83 * QUEST code_offset
  84 *      *, +, ?, respectively. Try to gobble as much as possible from the
  85 *      matched buffer, until code block that follows these instructions
  86 *      matches. When the longest possible string is matched,
  87 *      jump to code_offset
  88 *
  89 * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
  90 */
  91
  92static const char *meta_chars = "|.^$*+?()[\\";
  93
  94#ifdef SLRE_TEST
  95
  96static void
  97print_character_set(FILE *fp, const unsigned char *p, int len)
  98{
  99        int     i;
 100
 101        for (i = 0; i < len; i++) {
 102                if (i > 0)
 103                        (void) fputc(',', fp);
 104                if (p[i] == 0) {
 105                        i++;
 106                        if (p[i] == 0)
 107                                (void) fprintf(fp, "\\x%02x", p[i]);
 108                        else
 109                                (void) fprintf(fp, "%s", opcodes[p[i]].name);
 110                } else if (isprint(p[i])) {
 111                        (void) fputc(p[i], fp);
 112                } else {
 113                        (void) fprintf(fp, "\\x%02x", p[i]);
 114                }
 115        }
 116}
 117
 118void
 119slre_dump(const struct slre *r, FILE *fp)
 120{
 121        int     i, j, ch, op, pc;
 122
 123        for (pc = 0; pc < r->code_size; pc++) {
 124
 125                op = r->code[pc];
 126                (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
 127
 128                for (i = 0; opcodes[op].flags[i] != '\0'; i++)
 129                        switch (opcodes[op].flags[i]) {
 130                        case 'i':
 131                                (void) fprintf(fp, "%d ", r->code[pc + 1]);
 132                                pc++;
 133                                break;
 134                        case 'o':
 135                                (void) fprintf(fp, "%d ",
 136                                    pc + r->code[pc + 1] - i);
 137                                pc++;
 138                                break;
 139                        case 'D':
 140                                print_character_set(fp, r->data +
 141                                    r->code[pc + 1], r->code[pc + 2]);
 142                                pc += 2;
 143                                break;
 144                        case 'd':
 145                                (void) fputc('"', fp);
 146                                for (j = 0; j < r->code[pc + 2]; j++) {
 147                                        ch = r->data[r->code[pc + 1] + j];
 148                                        if (isprint(ch)) {
 149                                                (void) fputc(ch, fp);
 150                                        } else {
 151                                                (void) fprintf(fp,
 152                                                        "\\x%02x", ch);
 153                                        }
 154                                }
 155                                (void) fputc('"', fp);
 156                                pc += 2;
 157                                break;
 158                        }
 159
 160                (void) fputc('\n', fp);
 161        }
 162}
 163#endif /* SLRE_TEST */
 164
 165static void
 166set_jump_offset(struct slre *r, int pc, int offset)
 167{
 168        assert(offset < r->code_size);
 169
 170        if (r->code_size - offset > 0xff)
 171                r->err_str = "Jump offset is too big";
 172        else
 173                r->code[pc] = (unsigned char) (r->code_size - offset);
 174}
 175
 176static void
 177emit(struct slre *r, int code)
 178{
 179        if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
 180                r->err_str = "RE is too long (code overflow)";
 181        else
 182                r->code[r->code_size++] = (unsigned char) code;
 183}
 184
 185static void
 186store_char_in_data(struct slre *r, int ch)
 187{
 188        if (r->data_size >= (int) sizeof(r->data))
 189                r->err_str = "RE is too long (data overflow)";
 190        else
 191                r->data[r->data_size++] = ch;
 192}
 193
 194static void
 195exact(struct slre *r, const char **re)
 196{
 197        int     old_data_size = r->data_size;
 198
 199        while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
 200                store_char_in_data(r, *(*re)++);
 201
 202        emit(r, EXACT);
 203        emit(r, old_data_size);
 204        emit(r, r->data_size - old_data_size);
 205}
 206
 207static int
 208get_escape_char(const char **re)
 209{
 210        int     res;
 211
 212        switch (*(*re)++) {
 213        case 'n':
 214                res = '\n';
 215                break;
 216        case 'r':
 217                res = '\r';
 218                break;
 219        case 't':
 220                res = '\t';
 221                break;
 222        case '0':
 223                res = 0;
 224                break;
 225        case 'S':
 226                res = NONSPACE << 8;
 227                break;
 228        case 's':
 229                res = SPACE << 8;
 230                break;
 231        case 'd':
 232                res = DIGIT << 8;
 233                break;
 234        default:
 235                res = (*re)[-1];
 236                break;
 237        }
 238
 239        return res;
 240}
 241
 242static void
 243anyof(struct slre *r, const char **re)
 244{
 245        int     esc, old_data_size = r->data_size, op = ANYOF;
 246
 247        if (**re == '^') {
 248                op = ANYBUT;
 249                (*re)++;
 250        }
 251
 252        while (**re != '\0')
 253
 254                switch (*(*re)++) {
 255                case ']':
 256                        emit(r, op);
 257                        emit(r, old_data_size);
 258                        emit(r, r->data_size - old_data_size);
 259                        return;
 260                        /* NOTREACHED */
 261                        break;
 262                case '\\':
 263                        esc = get_escape_char(re);
 264                        if ((esc & 0xff) == 0) {
 265                                store_char_in_data(r, 0);
 266                                store_char_in_data(r, esc >> 8);
 267                        } else {
 268                                store_char_in_data(r, esc);
 269                        }
 270                        break;
 271                default:
 272                        store_char_in_data(r, (*re)[-1]);
 273                        break;
 274                }
 275
 276        r->err_str = "No closing ']' bracket";
 277}
 278
 279static void
 280relocate(struct slre *r, int begin, int shift)
 281{
 282        emit(r, END);
 283        memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
 284        r->code_size += shift;
 285}
 286
 287static void
 288quantifier(struct slre *r, int prev, int op)
 289{
 290        if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
 291                r->code[prev + 2]--;
 292                emit(r, EXACT);
 293                emit(r, r->code[prev + 1] + r->code[prev + 2]);
 294                emit(r, 1);
 295                prev = r->code_size - 3;
 296        }
 297        relocate(r, prev, 2);
 298        r->code[prev] = op;
 299        set_jump_offset(r, prev + 1, prev);
 300}
 301
 302static void
 303exact_one_char(struct slre *r, int ch)
 304{
 305        emit(r, EXACT);
 306        emit(r, r->data_size);
 307        emit(r, 1);
 308        store_char_in_data(r, ch);
 309}
 310
 311static void
 312fixup_branch(struct slre *r, int fixup)
 313{
 314        if (fixup > 0) {
 315                emit(r, END);
 316                set_jump_offset(r, fixup, fixup - 2);
 317        }
 318}
 319
 320static void
 321compile(struct slre *r, const char **re)
 322{
 323        int     op, esc, branch_start, last_op, fixup, cap_no, level;
 324
 325        fixup = 0;
 326        level = r->num_caps;
 327        branch_start = last_op = r->code_size;
 328
 329        for (;;)
 330                switch (*(*re)++) {
 331                case '\0':
 332                        (*re)--;
 333                        return;
 334                        /* NOTREACHED */
 335                        break;
 336                case '^':
 337                        emit(r, BOL);
 338                        break;
 339                case '$':
 340                        emit(r, EOL);
 341                        break;
 342                case '.':
 343                        last_op = r->code_size;
 344                        emit(r, ANY);
 345                        break;
 346                case '[':
 347                        last_op = r->code_size;
 348                        anyof(r, re);
 349                        break;
 350                case '\\':
 351                        last_op = r->code_size;
 352                        esc = get_escape_char(re);
 353                        if (esc & 0xff00)
 354                                emit(r, esc >> 8);
 355                        else
 356                                exact_one_char(r, esc);
 357                        break;
 358                case '(':
 359                        last_op = r->code_size;
 360                        cap_no = ++r->num_caps;
 361                        emit(r, OPEN);
 362                        emit(r, cap_no);
 363
 364                        compile(r, re);
 365                        if (*(*re)++ != ')') {
 366                                r->err_str = "No closing bracket";
 367                                return;
 368                        }
 369
 370                        emit(r, CLOSE);
 371                        emit(r, cap_no);
 372                        break;
 373                case ')':
 374                        (*re)--;
 375                        fixup_branch(r, fixup);
 376                        if (level == 0) {
 377                                r->err_str = "Unbalanced brackets";
 378                                return;
 379                        }
 380                        return;
 381                        /* NOTREACHED */
 382                        break;
 383                case '+':
 384                case '*':
 385                        op = (*re)[-1] == '*' ? STAR : PLUS;
 386                        if (**re == '?') {
 387                                (*re)++;
 388                                op = op == STAR ? STARQ : PLUSQ;
 389                        }
 390                        quantifier(r, last_op, op);
 391                        break;
 392                case '?':
 393                        quantifier(r, last_op, QUEST);
 394                        break;
 395                case '|':
 396                        fixup_branch(r, fixup);
 397                        relocate(r, branch_start, 3);
 398                        r->code[branch_start] = BRANCH;
 399                        set_jump_offset(r, branch_start + 1, branch_start);
 400                        fixup = branch_start + 2;
 401                        r->code[fixup] = 0xff;
 402                        break;
 403                default:
 404                        (*re)--;
 405                        last_op = r->code_size;
 406                        exact(r, re);
 407                        break;
 408                }
 409}
 410
 411int
 412slre_compile(struct slre *r, const char *re)
 413{
 414        r->err_str = NULL;
 415        r->code_size = r->data_size = r->num_caps = r->anchored = 0;
 416
 417        if (*re == '^')
 418                r->anchored++;
 419
 420        emit(r, OPEN);  /* This will capture what matches full RE */
 421        emit(r, 0);
 422
 423        while (*re != '\0')
 424                compile(r, &re);
 425
 426        if (r->code[2] == BRANCH)
 427                fixup_branch(r, 4);
 428
 429        emit(r, CLOSE);
 430        emit(r, 0);
 431        emit(r, END);
 432
 433        return (r->err_str == NULL ? 1 : 0);
 434}
 435
 436static int match(const struct slre *, int,
 437                const char *, int, int *, struct cap *);
 438
 439static void
 440loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
 441{
 442        int     saved_offset, matched_offset;
 443
 444        saved_offset = matched_offset = *ofs;
 445
 446        while (match(r, pc + 2, s, len, ofs, NULL)) {
 447                saved_offset = *ofs;
 448                if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
 449                        matched_offset = saved_offset;
 450                *ofs = saved_offset;
 451        }
 452
 453        *ofs = matched_offset;
 454}
 455
 456static void
 457loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
 458{
 459        int     saved_offset = *ofs;
 460
 461        while (match(r, pc + 2, s, len, ofs, NULL)) {
 462                saved_offset = *ofs;
 463                if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
 464                        break;
 465        }
 466
 467        *ofs = saved_offset;
 468}
 469
 470static int
 471is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
 472{
 473        int     i, ch;
 474
 475        ch = s[*ofs];
 476
 477        for (i = 0; i < len; i++)
 478                if (p[i] == ch) {
 479                        (*ofs)++;
 480                        return 1;
 481                }
 482
 483        return 0;
 484}
 485
 486static int
 487is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
 488{
 489        int     i, ch;
 490
 491        ch = s[*ofs];
 492
 493        for (i = 0; i < len; i++) {
 494                if (p[i] == ch)
 495                        return 0;
 496        }
 497
 498        (*ofs)++;
 499        return 1;
 500}
 501
 502static int
 503match(const struct slre *r, int pc, const char *s, int len,
 504                int *ofs, struct cap *caps)
 505{
 506        int     n, saved_offset, res = 1;
 507
 508        while (res && r->code[pc] != END) {
 509
 510                assert(pc < r->code_size);
 511                assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
 512
 513                switch (r->code[pc]) {
 514                case BRANCH:
 515                        saved_offset = *ofs;
 516                        res = match(r, pc + 3, s, len, ofs, caps);
 517                        if (res == 0) {
 518                                *ofs = saved_offset;
 519                                res = match(r, pc + r->code[pc + 1],
 520                                    s, len, ofs, caps);
 521                        }
 522                        pc += r->code[pc + 2];
 523                        break;
 524                case EXACT:
 525                        res = 0;
 526                        n = r->code[pc + 2];    /* String length */
 527                        if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
 528                            r->code[pc + 1], n)) {
 529                                (*ofs) += n;
 530                                res = 1;
 531                        }
 532                        pc += 3;
 533                        break;
 534                case QUEST:
 535                        res = 1;
 536                        saved_offset = *ofs;
 537                        if (!match(r, pc + 2, s, len, ofs, caps))
 538                                *ofs = saved_offset;
 539                        pc += r->code[pc + 1];
 540                        break;
 541                case STAR:
 542                        res = 1;
 543                        loop_greedy(r, pc, s, len, ofs);
 544                        pc += r->code[pc + 1];
 545                        break;
 546                case STARQ:
 547                        res = 1;
 548                        loop_non_greedy(r, pc, s, len, ofs);
 549                        pc += r->code[pc + 1];
 550                        break;
 551                case PLUS:
 552                        res = match(r, pc + 2, s, len, ofs, caps);
 553                        if (res == 0)
 554                                break;
 555
 556                        loop_greedy(r, pc, s, len, ofs);
 557                        pc += r->code[pc + 1];
 558                        break;
 559                case PLUSQ:
 560                        res = match(r, pc + 2, s, len, ofs, caps);
 561                        if (res == 0)
 562                                break;
 563
 564                        loop_non_greedy(r, pc, s, len, ofs);
 565                        pc += r->code[pc + 1];
 566                        break;
 567                case SPACE:
 568                        res = 0;
 569                        if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
 570                                (*ofs)++;
 571                                res = 1;
 572                        }
 573                        pc++;
 574                        break;
 575                case NONSPACE:
 576                        res = 0;
 577                        if (*ofs < len &&
 578                                        !isspace(((unsigned char *)s)[*ofs])) {
 579                                (*ofs)++;
 580                                res = 1;
 581                        }
 582                        pc++;
 583                        break;
 584                case DIGIT:
 585                        res = 0;
 586                        if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
 587                                (*ofs)++;
 588                                res = 1;
 589                        }
 590                        pc++;
 591                        break;
 592                case ANY:
 593                        res = 0;
 594                        if (*ofs < len) {
 595                                (*ofs)++;
 596                                res = 1;
 597                        }
 598                        pc++;
 599                        break;
 600                case ANYOF:
 601                        res = 0;
 602                        if (*ofs < len)
 603                                res = is_any_of(r->data + r->code[pc + 1],
 604                                        r->code[pc + 2], s, ofs);
 605                        pc += 3;
 606                        break;
 607                case ANYBUT:
 608                        res = 0;
 609                        if (*ofs < len)
 610                                res = is_any_but(r->data + r->code[pc + 1],
 611                                        r->code[pc + 2], s, ofs);
 612                        pc += 3;
 613                        break;
 614                case BOL:
 615                        res = *ofs == 0 ? 1 : 0;
 616                        pc++;
 617                        break;
 618                case EOL:
 619                        res = *ofs == len ? 1 : 0;
 620                        pc++;
 621                        break;
 622                case OPEN:
 623                        if (caps != NULL)
 624                                caps[r->code[pc + 1]].ptr = s + *ofs;
 625                        pc += 2;
 626                        break;
 627                case CLOSE:
 628                        if (caps != NULL)
 629                                caps[r->code[pc + 1]].len = (s + *ofs) -
 630                                    caps[r->code[pc + 1]].ptr;
 631                        pc += 2;
 632                        break;
 633                case END:
 634                        pc++;
 635                        break;
 636                default:
 637                        printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
 638                        assert(0);
 639                        break;
 640                }
 641        }
 642
 643        return res;
 644}
 645
 646int
 647slre_match(const struct slre *r, const char *buf, int len,
 648                struct cap *caps)
 649{
 650        int     i, ofs = 0, res = 0;
 651
 652        if (r->anchored) {
 653                res = match(r, 0, buf, len, &ofs, caps);
 654        } else {
 655                for (i = 0; i < len && res == 0; i++) {
 656                        ofs = i;
 657                        res = match(r, 0, buf, len, &ofs, caps);
 658                }
 659        }
 660
 661        return res;
 662}
 663
 664#ifdef SLRE_TEST
 665#define N_CAPS  5
 666
 667int main(int argc, char *argv[])
 668{
 669        struct slre     slre;
 670        struct cap      caps[N_CAPS];
 671        unsigned char   data[1 * 1024 * 1024];
 672        FILE            *fp;
 673        int             i, res, len;
 674
 675        if (argc < 2) {
 676                fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
 677                return 1;
 678        }
 679
 680        fp = fopen(argv[2], "rb");
 681        if (fp == NULL) {
 682                fprintf(stderr, "Error: cannot open %s:%s\n",
 683                        argv[2], strerror(errno));
 684                return 1;
 685        }
 686
 687        if (!slre_compile(&slre, argv[1])) {
 688                fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
 689                return 1;
 690        }
 691
 692        slre_dump(&slre, stderr);
 693
 694        while (fgets(data, sizeof(data), fp) != NULL) {
 695                len = strlen(data);
 696
 697                if ((len > 0) && (data[len-1] == '\n')) {
 698                        data[len-1] = '\0';
 699                        --len;
 700                }
 701
 702                printf("Data = \"%s\"\n", data);
 703
 704                (void) memset(caps, 0, sizeof(caps));
 705
 706                res = 0;
 707
 708                res = slre_match(&slre, data, len, caps);
 709                printf("Result [%d]: %d\n", i, res);
 710
 711                for (i = 0; i < N_CAPS; i++) {
 712                        if (caps[i].len > 0) {
 713                                printf("Substring %d: len=%d  [%.*s]\n", i,
 714                                        caps[i].len,
 715                                        caps[i].len, caps[i].ptr);
 716                        }
 717                }
 718                printf("----------------------------------------------------\n");
 719        }
 720        (void) fclose(fp);
 721
 722        return 0;
 723}
 724#endif /* SLRE_TEST */
 725