qemu/qobject/json-lexer.c
<<
>>
Prefs
   1/*
   2 * JSON lexer
   3 *
   4 * Copyright IBM, Corp. 2009
   5 *
   6 * Authors:
   7 *  Anthony Liguori   <aliguori@us.ibm.com>
   8 *
   9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
  10 * See the COPYING.LIB file in the top-level directory.
  11 *
  12 */
  13
  14#include "qemu/osdep.h"
  15#include "qemu-common.h"
  16#include "qapi/qmp/json-lexer.h"
  17
/*
 * Upper bound (64 << 20, i.e. 64 MiB) for the length of the token being
 * accumulated.  json_lexer_feed_char() compares the token buffer's len
 * against this and forces the token out once it is exceeded.
 */
#define MAX_TOKEN_SIZE (64ULL << 20)
  19
  20/*
  21 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
  22 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
  23 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+))
  24 * [{}\[\],:]
  25 * [a-z]+
  26 *
  27 */
  28
/*
 * Internal states of the lexer state machine.  Each enumerator is a row
 * index into json_lexer[]; the per-state notes below summarize the
 * transitions defined in that table.
 */
enum json_lexer_state {
    IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
    /* double-quoted string; UCODEn = n hex digits of a \u escape seen */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,        /* backslash seen inside "..." */
    IN_DQ_STRING,               /* inside "..." */
    /* single-quoted string; same layout as the double-quoted states */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,        /* backslash seen inside '...' */
    IN_SQ_STRING,               /* inside '...' */
    /* numbers */
    IN_ZERO,                    /* leading '0' seen */
    IN_DIGITS,                  /* one or more exponent digits seen */
    IN_DIGIT,                   /* exponent sign seen, a digit must follow */
    IN_EXP_E,                   /* 'e' or 'E' seen */
    IN_MANTISSA,                /* '.' seen, a digit must follow */
    IN_MANTISSA_DIGITS,         /* one or more digits after '.' seen */
    IN_NONZERO_NUMBER,          /* integer part starting with 1-9 */
    IN_NEG_NONZERO_NUMBER,      /* '-' seen; accepts '0' as well as 1-9 next */
    IN_KEYWORD,                 /* run of lowercase letters */
    /* '%' escapes */
    IN_ESCAPE,                  /* "%" seen */
    IN_ESCAPE_L,                /* "%l" seen */
    IN_ESCAPE_LL,               /* "%ll" seen */
    IN_ESCAPE_I,                /* "%I" seen */
    IN_ESCAPE_I6,               /* "%I6" seen */
    IN_ESCAPE_I64,              /* "%I64" seen */
    IN_WHITESPACE,              /* run of space, tab, CR or LF */
    IN_START,                   /* between tokens; also the initial state */
};
  61
/*
 * Lexer states and JSON_* token types share one value space: entries of
 * json_lexer[] and new_state in json_lexer_feed_char() hold either kind.
 * Break the build if the token types (starting at JSON_MIN) are not all
 * above the last state, IN_START.
 */
QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START);
  63
/*
 * Row initializer mapping every byte 0x00..0x7F to STATE.  Entries that
 * follow it in the same row initializer override individual bytes, so it
 * has to come first in the row.
 */
#define TERMINAL(state) [0 ... 0x7F] = (state)

/* Return whether TERMINAL is a terminal state and the transition to it
   from OLD_STATE required lookahead.  This happens whenever the table
   below uses the TERMINAL macro.

   The check compares TERMINAL with column 0 of OLD_STATE's row, which
   TERMINAL() fills in and no row below overrides.  Note that it is also
   true when that column and TERMINAL are both IN_ERROR (0).  */
#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
            (json_lexer[(old_state)][0] == (terminal))
  71
/*
 * Transition table of the lexer: json_lexer[state][byte] is either the
 * next enum json_lexer_state or, when a token is complete, its JSON_*
 * token type.  Rows that start with TERMINAL(type) end their token on
 * any byte in 0x00..0x7F not overridden later in the row.
 */
static const uint8_t json_lexer[][256] =  {
    /* Relies on default initialization to IN_ERROR! */

    /* double quote string */
    /* \uXXXX: each UCODE state accepts one hex digit, the fourth returns
       to the string body */
    [IN_DQ_UCODE3] = {
        ['0' ... '9'] = IN_DQ_STRING,
        ['a' ... 'f'] = IN_DQ_STRING,
        ['A' ... 'F'] = IN_DQ_STRING,
    },
    [IN_DQ_UCODE2] = {
        ['0' ... '9'] = IN_DQ_UCODE3,
        ['a' ... 'f'] = IN_DQ_UCODE3,
        ['A' ... 'F'] = IN_DQ_UCODE3,
    },
    [IN_DQ_UCODE1] = {
        ['0' ... '9'] = IN_DQ_UCODE2,
        ['a' ... 'f'] = IN_DQ_UCODE2,
        ['A' ... 'F'] = IN_DQ_UCODE2,
    },
    [IN_DQ_UCODE0] = {
        ['0' ... '9'] = IN_DQ_UCODE1,
        ['a' ... 'f'] = IN_DQ_UCODE1,
        ['A' ... 'F'] = IN_DQ_UCODE1,
    },
    /* character following a backslash */
    [IN_DQ_STRING_ESCAPE] = {
        ['b'] = IN_DQ_STRING,
        ['f'] =  IN_DQ_STRING,
        ['n'] =  IN_DQ_STRING,
        ['r'] =  IN_DQ_STRING,
        ['t'] =  IN_DQ_STRING,
        ['/'] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING,
        ['\''] = IN_DQ_STRING,
        ['\"'] = IN_DQ_STRING,
        ['u'] = IN_DQ_UCODE0,
    },
    /* string body: bytes 0x00, 0xC0, 0xC1 and 0xF5..0xFF are left as
       IN_ERROR (presumably because they cannot occur in valid UTF-8);
       the two single-byte entries override the first range */
    [IN_DQ_STRING] = {
        [1 ... 0xBF] = IN_DQ_STRING,
        [0xC2 ... 0xF4] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING_ESCAPE,
        ['"'] = JSON_STRING,
    },

    /* single quote string */
    /* mirrors the double quote rows, with ' as the closing delimiter */
    [IN_SQ_UCODE3] = {
        ['0' ... '9'] = IN_SQ_STRING,
        ['a' ... 'f'] = IN_SQ_STRING,
        ['A' ... 'F'] = IN_SQ_STRING,
    },
    [IN_SQ_UCODE2] = {
        ['0' ... '9'] = IN_SQ_UCODE3,
        ['a' ... 'f'] = IN_SQ_UCODE3,
        ['A' ... 'F'] = IN_SQ_UCODE3,
    },
    [IN_SQ_UCODE1] = {
        ['0' ... '9'] = IN_SQ_UCODE2,
        ['a' ... 'f'] = IN_SQ_UCODE2,
        ['A' ... 'F'] = IN_SQ_UCODE2,
    },
    [IN_SQ_UCODE0] = {
        ['0' ... '9'] = IN_SQ_UCODE1,
        ['a' ... 'f'] = IN_SQ_UCODE1,
        ['A' ... 'F'] = IN_SQ_UCODE1,
    },
    [IN_SQ_STRING_ESCAPE] = {
        ['b'] = IN_SQ_STRING,
        ['f'] =  IN_SQ_STRING,
        ['n'] =  IN_SQ_STRING,
        ['r'] =  IN_SQ_STRING,
        ['t'] =  IN_SQ_STRING,
        ['/'] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['\"'] = IN_SQ_STRING,
        ['u'] = IN_SQ_UCODE0,
    },
    [IN_SQ_STRING] = {
        [1 ... 0xBF] = IN_SQ_STRING,
        [0xC2 ... 0xF4] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING_ESCAPE,
        ['\''] = JSON_STRING,
    },

    /* Zero */
    /* a leading zero may be followed by a fraction but not by another
       digit; anything else in 0x00..0x7F ends the integer */
    [IN_ZERO] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_ERROR,
        ['.'] = IN_MANTISSA,
    },

    /* Float */
    /* exponent digits */
    [IN_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_DIGITS,
    },

    /* after the exponent sign: at least one digit is required */
    [IN_DIGIT] = {
        ['0' ... '9'] = IN_DIGITS,
    },

    /* after 'e' / 'E': optional sign, then digits */
    [IN_EXP_E] = {
        ['-'] = IN_DIGIT,
        ['+'] = IN_DIGIT,
        ['0' ... '9'] = IN_DIGITS,
    },

    /* digits after the decimal point, optionally followed by an exponent */
    [IN_MANTISSA_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
    },

    /* after '.': at least one digit is required */
    [IN_MANTISSA] = {
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
    },

    /* Number */
    [IN_NONZERO_NUMBER] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_NONZERO_NUMBER,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
        ['.'] = IN_MANTISSA,
    },

    /* after '-': a digit must follow */
    [IN_NEG_NONZERO_NUMBER] = {
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
    },

    /* keywords */
    [IN_KEYWORD] = {
        TERMINAL(JSON_KEYWORD),
        ['a' ... 'z'] = IN_KEYWORD,
    },

    /* whitespace */
    /* a whitespace run ends in JSON_SKIP, which json_lexer_feed_char()
       discards without calling the emitter */
    [IN_WHITESPACE] = {
        TERMINAL(JSON_SKIP),
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },        

    /* escape */
    /* accepted after '%': d, i, p, s, f, ld, lld, I64d */
    [IN_ESCAPE_LL] = {
        ['d'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_L] = {
        ['d'] = JSON_ESCAPE,
        ['l'] = IN_ESCAPE_LL,
    },

    [IN_ESCAPE_I64] = {
        ['d'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_I6] = {
        ['4'] = IN_ESCAPE_I64,
    },

    [IN_ESCAPE_I] = {
        ['6'] = IN_ESCAPE_I6,
    },

    [IN_ESCAPE] = {
        ['d'] = JSON_ESCAPE,
        ['i'] = JSON_ESCAPE,
        ['p'] = JSON_ESCAPE,
        ['s'] = JSON_ESCAPE,
        ['f'] = JSON_ESCAPE,
        ['l'] = IN_ESCAPE_L,
        ['I'] = IN_ESCAPE_I,
    },

    /* top level rule */
    /* dispatch on the first byte of a token; single-character tokens are
       completed immediately */
    [IN_START] = {
        ['"'] = IN_DQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
        ['-'] = IN_NEG_NONZERO_NUMBER,
        ['{'] = JSON_LCURLY,
        ['}'] = JSON_RCURLY,
        ['['] = JSON_LSQUARE,
        [']'] = JSON_RSQUARE,
        [','] = JSON_COMMA,
        [':'] = JSON_COLON,
        ['a' ... 'z'] = IN_KEYWORD,
        ['%'] = IN_ESCAPE,
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },
};
 271
 272void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
 273{
 274    lexer->emit = func;
 275    lexer->state = IN_START;
 276    lexer->token = g_string_sized_new(3);
 277    lexer->x = lexer->y = 0;
 278}
 279
 280static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
 281{
 282    int char_consumed, new_state;
 283
 284    lexer->x++;
 285    if (ch == '\n') {
 286        lexer->x = 0;
 287        lexer->y++;
 288    }
 289
 290    do {
 291        assert(lexer->state <= ARRAY_SIZE(json_lexer));
 292        new_state = json_lexer[lexer->state][(uint8_t)ch];
 293        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
 294        if (char_consumed) {
 295            g_string_append_c(lexer->token, ch);
 296        }
 297
 298        switch (new_state) {
 299        case JSON_LCURLY:
 300        case JSON_RCURLY:
 301        case JSON_LSQUARE:
 302        case JSON_RSQUARE:
 303        case JSON_COLON:
 304        case JSON_COMMA:
 305        case JSON_ESCAPE:
 306        case JSON_INTEGER:
 307        case JSON_FLOAT:
 308        case JSON_KEYWORD:
 309        case JSON_STRING:
 310            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
 311            /* fall through */
 312        case JSON_SKIP:
 313            g_string_truncate(lexer->token, 0);
 314            new_state = IN_START;
 315            break;
 316        case IN_ERROR:
 317            /* XXX: To avoid having previous bad input leaving the parser in an
 318             * unresponsive state where we consume unpredictable amounts of
 319             * subsequent "good" input, percolate this error state up to the
 320             * tokenizer/parser by forcing a NULL object to be emitted, then
 321             * reset state.
 322             *
 323             * Also note that this handling is required for reliable channel
 324             * negotiation between QMP and the guest agent, since chr(0xFF)
 325             * is placed at the beginning of certain events to ensure proper
 326             * delivery when the channel is in an unknown state. chr(0xFF) is
 327             * never a valid ASCII/UTF-8 sequence, so this should reliably
 328             * induce an error/flush state.
 329             */
 330            lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
 331            g_string_truncate(lexer->token, 0);
 332            new_state = IN_START;
 333            lexer->state = new_state;
 334            return 0;
 335        default:
 336            break;
 337        }
 338        lexer->state = new_state;
 339    } while (!char_consumed && !flush);
 340
 341    /* Do not let a single token grow to an arbitrarily large size,
 342     * this is a security consideration.
 343     */
 344    if (lexer->token->len > MAX_TOKEN_SIZE) {
 345        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
 346        g_string_truncate(lexer->token, 0);
 347        lexer->state = IN_START;
 348    }
 349
 350    return 0;
 351}
 352
 353int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
 354{
 355    size_t i;
 356
 357    for (i = 0; i < size; i++) {
 358        int err;
 359
 360        err = json_lexer_feed_char(lexer, buffer[i], false);
 361        if (err < 0) {
 362            return err;
 363        }
 364    }
 365
 366    return 0;
 367}
 368
 369int json_lexer_flush(JSONLexer *lexer)
 370{
 371    return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
 372}
 373
 374void json_lexer_destroy(JSONLexer *lexer)
 375{
 376    g_string_free(lexer->token, true);
 377}
 378