qemu/json-lexer.c
<<
>>
Prefs
   1/*
   2 * JSON lexer
   3 *
   4 * Copyright IBM, Corp. 2009
   5 *
   6 * Authors:
   7 *  Anthony Liguori   <aliguori@us.ibm.com>
   8 *
   9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
  10 * See the COPYING.LIB file in the top-level directory.
  11 *
  12 */
  13
  14#include "qstring.h"
  15#include "qlist.h"
  16#include "qdict.h"
  17#include "qint.h"
  18#include "qemu-common.h"
  19#include "json-lexer.h"
  20
  21#define MAX_TOKEN_SIZE (64ULL << 20)
  22
  23/*
  24 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
  25 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
  26 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+))
  27 * [{}\[\],:]
  28 * [a-z]+
  29 *
  30 */
  31
  32enum json_lexer_state {
  33    IN_ERROR = 0,
  34    IN_DQ_UCODE3,
  35    IN_DQ_UCODE2,
  36    IN_DQ_UCODE1,
  37    IN_DQ_UCODE0,
  38    IN_DQ_STRING_ESCAPE,
  39    IN_DQ_STRING,
  40    IN_SQ_UCODE3,
  41    IN_SQ_UCODE2,
  42    IN_SQ_UCODE1,
  43    IN_SQ_UCODE0,
  44    IN_SQ_STRING_ESCAPE,
  45    IN_SQ_STRING,
  46    IN_ZERO,
  47    IN_DIGITS,
  48    IN_DIGIT,
  49    IN_EXP_E,
  50    IN_MANTISSA,
  51    IN_MANTISSA_DIGITS,
  52    IN_NONZERO_NUMBER,
  53    IN_NEG_NONZERO_NUMBER,
  54    IN_KEYWORD,
  55    IN_ESCAPE,
  56    IN_ESCAPE_L,
  57    IN_ESCAPE_LL,
  58    IN_ESCAPE_I,
  59    IN_ESCAPE_I6,
  60    IN_ESCAPE_I64,
  61    IN_WHITESPACE,
  62    IN_START,
  63};
  64
  65#define TERMINAL(state) [0 ... 0x7F] = (state)
  66
  67/* Return whether TERMINAL is a terminal state and the transition to it
  68   from OLD_STATE required lookahead.  This happens whenever the table
  69   below uses the TERMINAL macro.  */
  70#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
  71            (json_lexer[(old_state)][0] == (terminal))
  72
  73static const uint8_t json_lexer[][256] =  {
  74    /* double quote string */
  75    [IN_DQ_UCODE3] = {
  76        ['0' ... '9'] = IN_DQ_STRING,
  77        ['a' ... 'f'] = IN_DQ_STRING,
  78        ['A' ... 'F'] = IN_DQ_STRING,
  79    },
  80    [IN_DQ_UCODE2] = {
  81        ['0' ... '9'] = IN_DQ_UCODE3,
  82        ['a' ... 'f'] = IN_DQ_UCODE3,
  83        ['A' ... 'F'] = IN_DQ_UCODE3,
  84    },
  85    [IN_DQ_UCODE1] = {
  86        ['0' ... '9'] = IN_DQ_UCODE2,
  87        ['a' ... 'f'] = IN_DQ_UCODE2,
  88        ['A' ... 'F'] = IN_DQ_UCODE2,
  89    },
  90    [IN_DQ_UCODE0] = {
  91        ['0' ... '9'] = IN_DQ_UCODE1,
  92        ['a' ... 'f'] = IN_DQ_UCODE1,
  93        ['A' ... 'F'] = IN_DQ_UCODE1,
  94    },
  95    [IN_DQ_STRING_ESCAPE] = {
  96        ['b'] = IN_DQ_STRING,
  97        ['f'] =  IN_DQ_STRING,
  98        ['n'] =  IN_DQ_STRING,
  99        ['r'] =  IN_DQ_STRING,
 100        ['t'] =  IN_DQ_STRING,
 101        ['/'] = IN_DQ_STRING,
 102        ['\\'] = IN_DQ_STRING,
 103        ['\''] = IN_DQ_STRING,
 104        ['\"'] = IN_DQ_STRING,
 105        ['u'] = IN_DQ_UCODE0,
 106    },
 107    [IN_DQ_STRING] = {
 108        [1 ... 0xBF] = IN_DQ_STRING,
 109        [0xC2 ... 0xF4] = IN_DQ_STRING,
 110        ['\\'] = IN_DQ_STRING_ESCAPE,
 111        ['"'] = JSON_STRING,
 112    },
 113
 114    /* single quote string */
 115    [IN_SQ_UCODE3] = {
 116        ['0' ... '9'] = IN_SQ_STRING,
 117        ['a' ... 'f'] = IN_SQ_STRING,
 118        ['A' ... 'F'] = IN_SQ_STRING,
 119    },
 120    [IN_SQ_UCODE2] = {
 121        ['0' ... '9'] = IN_SQ_UCODE3,
 122        ['a' ... 'f'] = IN_SQ_UCODE3,
 123        ['A' ... 'F'] = IN_SQ_UCODE3,
 124    },
 125    [IN_SQ_UCODE1] = {
 126        ['0' ... '9'] = IN_SQ_UCODE2,
 127        ['a' ... 'f'] = IN_SQ_UCODE2,
 128        ['A' ... 'F'] = IN_SQ_UCODE2,
 129    },
 130    [IN_SQ_UCODE0] = {
 131        ['0' ... '9'] = IN_SQ_UCODE1,
 132        ['a' ... 'f'] = IN_SQ_UCODE1,
 133        ['A' ... 'F'] = IN_SQ_UCODE1,
 134    },
 135    [IN_SQ_STRING_ESCAPE] = {
 136        ['b'] = IN_SQ_STRING,
 137        ['f'] =  IN_SQ_STRING,
 138        ['n'] =  IN_SQ_STRING,
 139        ['r'] =  IN_SQ_STRING,
 140        ['t'] =  IN_SQ_STRING,
 141        ['/'] = IN_DQ_STRING,
 142        ['\\'] = IN_DQ_STRING,
 143        ['\''] = IN_SQ_STRING,
 144        ['\"'] = IN_SQ_STRING,
 145        ['u'] = IN_SQ_UCODE0,
 146    },
 147    [IN_SQ_STRING] = {
 148        [1 ... 0xBF] = IN_SQ_STRING,
 149        [0xC2 ... 0xF4] = IN_SQ_STRING,
 150        ['\\'] = IN_SQ_STRING_ESCAPE,
 151        ['\''] = JSON_STRING,
 152    },
 153
 154    /* Zero */
 155    [IN_ZERO] = {
 156        TERMINAL(JSON_INTEGER),
 157        ['0' ... '9'] = IN_ERROR,
 158        ['.'] = IN_MANTISSA,
 159    },
 160
 161    /* Float */
 162    [IN_DIGITS] = {
 163        TERMINAL(JSON_FLOAT),
 164        ['0' ... '9'] = IN_DIGITS,
 165    },
 166
 167    [IN_DIGIT] = {
 168        ['0' ... '9'] = IN_DIGITS,
 169    },
 170
 171    [IN_EXP_E] = {
 172        ['-'] = IN_DIGIT,
 173        ['+'] = IN_DIGIT,
 174        ['0' ... '9'] = IN_DIGITS,
 175    },
 176
 177    [IN_MANTISSA_DIGITS] = {
 178        TERMINAL(JSON_FLOAT),
 179        ['0' ... '9'] = IN_MANTISSA_DIGITS,
 180        ['e'] = IN_EXP_E,
 181        ['E'] = IN_EXP_E,
 182    },
 183
 184    [IN_MANTISSA] = {
 185        ['0' ... '9'] = IN_MANTISSA_DIGITS,
 186    },
 187
 188    /* Number */
 189    [IN_NONZERO_NUMBER] = {
 190        TERMINAL(JSON_INTEGER),
 191        ['0' ... '9'] = IN_NONZERO_NUMBER,
 192        ['e'] = IN_EXP_E,
 193        ['E'] = IN_EXP_E,
 194        ['.'] = IN_MANTISSA,
 195    },
 196
 197    [IN_NEG_NONZERO_NUMBER] = {
 198        ['0'] = IN_ZERO,
 199        ['1' ... '9'] = IN_NONZERO_NUMBER,
 200    },
 201
 202    /* keywords */
 203    [IN_KEYWORD] = {
 204        TERMINAL(JSON_KEYWORD),
 205        ['a' ... 'z'] = IN_KEYWORD,
 206    },
 207
 208    /* whitespace */
 209    [IN_WHITESPACE] = {
 210        TERMINAL(JSON_SKIP),
 211        [' '] = IN_WHITESPACE,
 212        ['\t'] = IN_WHITESPACE,
 213        ['\r'] = IN_WHITESPACE,
 214        ['\n'] = IN_WHITESPACE,
 215    },        
 216
 217    /* escape */
 218    [IN_ESCAPE_LL] = {
 219        ['d'] = JSON_ESCAPE,
 220    },
 221
 222    [IN_ESCAPE_L] = {
 223        ['d'] = JSON_ESCAPE,
 224        ['l'] = IN_ESCAPE_LL,
 225    },
 226
 227    [IN_ESCAPE_I64] = {
 228        ['d'] = JSON_ESCAPE,
 229    },
 230
 231    [IN_ESCAPE_I6] = {
 232        ['4'] = IN_ESCAPE_I64,
 233    },
 234
 235    [IN_ESCAPE_I] = {
 236        ['6'] = IN_ESCAPE_I6,
 237    },
 238
 239    [IN_ESCAPE] = {
 240        ['d'] = JSON_ESCAPE,
 241        ['i'] = JSON_ESCAPE,
 242        ['p'] = JSON_ESCAPE,
 243        ['s'] = JSON_ESCAPE,
 244        ['f'] = JSON_ESCAPE,
 245        ['l'] = IN_ESCAPE_L,
 246        ['I'] = IN_ESCAPE_I,
 247    },
 248
 249    /* top level rule */
 250    [IN_START] = {
 251        ['"'] = IN_DQ_STRING,
 252        ['\''] = IN_SQ_STRING,
 253        ['0'] = IN_ZERO,
 254        ['1' ... '9'] = IN_NONZERO_NUMBER,
 255        ['-'] = IN_NEG_NONZERO_NUMBER,
 256        ['{'] = JSON_OPERATOR,
 257        ['}'] = JSON_OPERATOR,
 258        ['['] = JSON_OPERATOR,
 259        [']'] = JSON_OPERATOR,
 260        [','] = JSON_OPERATOR,
 261        [':'] = JSON_OPERATOR,
 262        ['a' ... 'z'] = IN_KEYWORD,
 263        ['%'] = IN_ESCAPE,
 264        [' '] = IN_WHITESPACE,
 265        ['\t'] = IN_WHITESPACE,
 266        ['\r'] = IN_WHITESPACE,
 267        ['\n'] = IN_WHITESPACE,
 268    },
 269};
 270
 271void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
 272{
 273    lexer->emit = func;
 274    lexer->state = IN_START;
 275    lexer->token = qstring_new();
 276    lexer->x = lexer->y = 0;
 277}
 278
 279static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
 280{
 281    int char_consumed, new_state;
 282
 283    lexer->x++;
 284    if (ch == '\n') {
 285        lexer->x = 0;
 286        lexer->y++;
 287    }
 288
 289    do {
 290        new_state = json_lexer[lexer->state][(uint8_t)ch];
 291        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
 292        if (char_consumed) {
 293            qstring_append_chr(lexer->token, ch);
 294        }
 295
 296        switch (new_state) {
 297        case JSON_OPERATOR:
 298        case JSON_ESCAPE:
 299        case JSON_INTEGER:
 300        case JSON_FLOAT:
 301        case JSON_KEYWORD:
 302        case JSON_STRING:
 303            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
 304        case JSON_SKIP:
 305            QDECREF(lexer->token);
 306            lexer->token = qstring_new();
 307            new_state = IN_START;
 308            break;
 309        case IN_ERROR:
 310            /* XXX: To avoid having previous bad input leaving the parser in an
 311             * unresponsive state where we consume unpredictable amounts of
 312             * subsequent "good" input, percolate this error state up to the
 313             * tokenizer/parser by forcing a NULL object to be emitted, then
 314             * reset state.
 315             *
 316             * Also note that this handling is required for reliable channel
 317             * negotiation between QMP and the guest agent, since chr(0xFF)
 318             * is placed at the beginning of certain events to ensure proper
 319             * delivery when the channel is in an unknown state. chr(0xFF) is
 320             * never a valid ASCII/UTF-8 sequence, so this should reliably
 321             * induce an error/flush state.
 322             */
 323            lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
 324            QDECREF(lexer->token);
 325            lexer->token = qstring_new();
 326            new_state = IN_START;
 327            lexer->state = new_state;
 328            return 0;
 329        default:
 330            break;
 331        }
 332        lexer->state = new_state;
 333    } while (!char_consumed && !flush);
 334
 335    /* Do not let a single token grow to an arbitrarily large size,
 336     * this is a security consideration.
 337     */
 338    if (lexer->token->length > MAX_TOKEN_SIZE) {
 339        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
 340        QDECREF(lexer->token);
 341        lexer->token = qstring_new();
 342        lexer->state = IN_START;
 343    }
 344
 345    return 0;
 346}
 347
 348int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
 349{
 350    size_t i;
 351
 352    for (i = 0; i < size; i++) {
 353        int err;
 354
 355        err = json_lexer_feed_char(lexer, buffer[i], false);
 356        if (err < 0) {
 357            return err;
 358        }
 359    }
 360
 361    return 0;
 362}
 363
 364int json_lexer_flush(JSONLexer *lexer)
 365{
 366    return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
 367}
 368
 369void json_lexer_destroy(JSONLexer *lexer)
 370{
 371    QDECREF(lexer->token);
 372}
 373