qemu/qobject/json-lexer.c
<<
>>
Prefs
   1/*
   2 * JSON lexer
   3 *
   4 * Copyright IBM, Corp. 2009
   5 *
   6 * Authors:
   7 *  Anthony Liguori   <aliguori@us.ibm.com>
   8 *
   9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
  10 * See the COPYING.LIB file in the top-level directory.
  11 *
  12 */
  13
  14#include "qemu/osdep.h"
  15#include "qemu-common.h"
  16#include "qapi/qmp/json-lexer.h"
  17
  18#define MAX_TOKEN_SIZE (64ULL << 20)
  19
/*
 * Required by JSON (RFC 7159):
 *
 * \"([^\\\"]|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*\"
 * -?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?
 * [{}\[\],:]
 * [a-z]+   # covers null, true, false
 *
 * Extension of '' strings:
 *
 * '([^\\']|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*'
 *
 * Extension for vararg handling in JSON construction:
 *
 * %((l|ll|I64)?[du]|[ipsf])
 *
 */
  37
  38enum json_lexer_state {
  39    IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
  40    IN_DQ_UCODE3,
  41    IN_DQ_UCODE2,
  42    IN_DQ_UCODE1,
  43    IN_DQ_UCODE0,
  44    IN_DQ_STRING_ESCAPE,
  45    IN_DQ_STRING,
  46    IN_SQ_UCODE3,
  47    IN_SQ_UCODE2,
  48    IN_SQ_UCODE1,
  49    IN_SQ_UCODE0,
  50    IN_SQ_STRING_ESCAPE,
  51    IN_SQ_STRING,
  52    IN_ZERO,
  53    IN_DIGITS,
  54    IN_DIGIT,
  55    IN_EXP_E,
  56    IN_MANTISSA,
  57    IN_MANTISSA_DIGITS,
  58    IN_NONZERO_NUMBER,
  59    IN_NEG_NONZERO_NUMBER,
  60    IN_KEYWORD,
  61    IN_ESCAPE,
  62    IN_ESCAPE_L,
  63    IN_ESCAPE_LL,
  64    IN_ESCAPE_I,
  65    IN_ESCAPE_I6,
  66    IN_ESCAPE_I64,
  67    IN_WHITESPACE,
  68    IN_START,
  69};
  70
  71QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START);
  72
  73#define TERMINAL(state) [0 ... 0x7F] = (state)
  74
  75/* Return whether TERMINAL is a terminal state and the transition to it
  76   from OLD_STATE required lookahead.  This happens whenever the table
  77   below uses the TERMINAL macro.  */
  78#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
  79            (json_lexer[(old_state)][0] == (terminal))
  80
  81static const uint8_t json_lexer[][256] =  {
  82    /* Relies on default initialization to IN_ERROR! */
  83
  84    /* double quote string */
  85    [IN_DQ_UCODE3] = {
  86        ['0' ... '9'] = IN_DQ_STRING,
  87        ['a' ... 'f'] = IN_DQ_STRING,
  88        ['A' ... 'F'] = IN_DQ_STRING,
  89    },
  90    [IN_DQ_UCODE2] = {
  91        ['0' ... '9'] = IN_DQ_UCODE3,
  92        ['a' ... 'f'] = IN_DQ_UCODE3,
  93        ['A' ... 'F'] = IN_DQ_UCODE3,
  94    },
  95    [IN_DQ_UCODE1] = {
  96        ['0' ... '9'] = IN_DQ_UCODE2,
  97        ['a' ... 'f'] = IN_DQ_UCODE2,
  98        ['A' ... 'F'] = IN_DQ_UCODE2,
  99    },
 100    [IN_DQ_UCODE0] = {
 101        ['0' ... '9'] = IN_DQ_UCODE1,
 102        ['a' ... 'f'] = IN_DQ_UCODE1,
 103        ['A' ... 'F'] = IN_DQ_UCODE1,
 104    },
 105    [IN_DQ_STRING_ESCAPE] = {
 106        ['b'] = IN_DQ_STRING,
 107        ['f'] =  IN_DQ_STRING,
 108        ['n'] =  IN_DQ_STRING,
 109        ['r'] =  IN_DQ_STRING,
 110        ['t'] =  IN_DQ_STRING,
 111        ['/'] = IN_DQ_STRING,
 112        ['\\'] = IN_DQ_STRING,
 113        ['\''] = IN_DQ_STRING,
 114        ['\"'] = IN_DQ_STRING,
 115        ['u'] = IN_DQ_UCODE0,
 116    },
 117    [IN_DQ_STRING] = {
 118        [1 ... 0xBF] = IN_DQ_STRING,
 119        [0xC2 ... 0xF4] = IN_DQ_STRING,
 120        ['\\'] = IN_DQ_STRING_ESCAPE,
 121        ['"'] = JSON_STRING,
 122    },
 123
 124    /* single quote string */
 125    [IN_SQ_UCODE3] = {
 126        ['0' ... '9'] = IN_SQ_STRING,
 127        ['a' ... 'f'] = IN_SQ_STRING,
 128        ['A' ... 'F'] = IN_SQ_STRING,
 129    },
 130    [IN_SQ_UCODE2] = {
 131        ['0' ... '9'] = IN_SQ_UCODE3,
 132        ['a' ... 'f'] = IN_SQ_UCODE3,
 133        ['A' ... 'F'] = IN_SQ_UCODE3,
 134    },
 135    [IN_SQ_UCODE1] = {
 136        ['0' ... '9'] = IN_SQ_UCODE2,
 137        ['a' ... 'f'] = IN_SQ_UCODE2,
 138        ['A' ... 'F'] = IN_SQ_UCODE2,
 139    },
 140    [IN_SQ_UCODE0] = {
 141        ['0' ... '9'] = IN_SQ_UCODE1,
 142        ['a' ... 'f'] = IN_SQ_UCODE1,
 143        ['A' ... 'F'] = IN_SQ_UCODE1,
 144    },
 145    [IN_SQ_STRING_ESCAPE] = {
 146        ['b'] = IN_SQ_STRING,
 147        ['f'] =  IN_SQ_STRING,
 148        ['n'] =  IN_SQ_STRING,
 149        ['r'] =  IN_SQ_STRING,
 150        ['t'] =  IN_SQ_STRING,
 151        ['/'] = IN_SQ_STRING,
 152        ['\\'] = IN_SQ_STRING,
 153        ['\''] = IN_SQ_STRING,
 154        ['\"'] = IN_SQ_STRING,
 155        ['u'] = IN_SQ_UCODE0,
 156    },
 157    [IN_SQ_STRING] = {
 158        [1 ... 0xBF] = IN_SQ_STRING,
 159        [0xC2 ... 0xF4] = IN_SQ_STRING,
 160        ['\\'] = IN_SQ_STRING_ESCAPE,
 161        ['\''] = JSON_STRING,
 162    },
 163
 164    /* Zero */
 165    [IN_ZERO] = {
 166        TERMINAL(JSON_INTEGER),
 167        ['0' ... '9'] = IN_ERROR,
 168        ['.'] = IN_MANTISSA,
 169    },
 170
 171    /* Float */
 172    [IN_DIGITS] = {
 173        TERMINAL(JSON_FLOAT),
 174        ['0' ... '9'] = IN_DIGITS,
 175    },
 176
 177    [IN_DIGIT] = {
 178        ['0' ... '9'] = IN_DIGITS,
 179    },
 180
 181    [IN_EXP_E] = {
 182        ['-'] = IN_DIGIT,
 183        ['+'] = IN_DIGIT,
 184        ['0' ... '9'] = IN_DIGITS,
 185    },
 186
 187    [IN_MANTISSA_DIGITS] = {
 188        TERMINAL(JSON_FLOAT),
 189        ['0' ... '9'] = IN_MANTISSA_DIGITS,
 190        ['e'] = IN_EXP_E,
 191        ['E'] = IN_EXP_E,
 192    },
 193
 194    [IN_MANTISSA] = {
 195        ['0' ... '9'] = IN_MANTISSA_DIGITS,
 196    },
 197
 198    /* Number */
 199    [IN_NONZERO_NUMBER] = {
 200        TERMINAL(JSON_INTEGER),
 201        ['0' ... '9'] = IN_NONZERO_NUMBER,
 202        ['e'] = IN_EXP_E,
 203        ['E'] = IN_EXP_E,
 204        ['.'] = IN_MANTISSA,
 205    },
 206
 207    [IN_NEG_NONZERO_NUMBER] = {
 208        ['0'] = IN_ZERO,
 209        ['1' ... '9'] = IN_NONZERO_NUMBER,
 210    },
 211
 212    /* keywords */
 213    [IN_KEYWORD] = {
 214        TERMINAL(JSON_KEYWORD),
 215        ['a' ... 'z'] = IN_KEYWORD,
 216    },
 217
 218    /* whitespace */
 219    [IN_WHITESPACE] = {
 220        TERMINAL(JSON_SKIP),
 221        [' '] = IN_WHITESPACE,
 222        ['\t'] = IN_WHITESPACE,
 223        ['\r'] = IN_WHITESPACE,
 224        ['\n'] = IN_WHITESPACE,
 225    },
 226
 227    /* escape */
 228    [IN_ESCAPE_LL] = {
 229        ['d'] = JSON_ESCAPE,
 230        ['u'] = JSON_ESCAPE,
 231    },
 232
 233    [IN_ESCAPE_L] = {
 234        ['d'] = JSON_ESCAPE,
 235        ['l'] = IN_ESCAPE_LL,
 236        ['u'] = JSON_ESCAPE,
 237    },
 238
 239    [IN_ESCAPE_I64] = {
 240        ['d'] = JSON_ESCAPE,
 241        ['u'] = JSON_ESCAPE,
 242    },
 243
 244    [IN_ESCAPE_I6] = {
 245        ['4'] = IN_ESCAPE_I64,
 246    },
 247
 248    [IN_ESCAPE_I] = {
 249        ['6'] = IN_ESCAPE_I6,
 250    },
 251
 252    [IN_ESCAPE] = {
 253        ['d'] = JSON_ESCAPE,
 254        ['i'] = JSON_ESCAPE,
 255        ['p'] = JSON_ESCAPE,
 256        ['s'] = JSON_ESCAPE,
 257        ['u'] = JSON_ESCAPE,
 258        ['f'] = JSON_ESCAPE,
 259        ['l'] = IN_ESCAPE_L,
 260        ['I'] = IN_ESCAPE_I,
 261    },
 262
 263    /* top level rule */
 264    [IN_START] = {
 265        ['"'] = IN_DQ_STRING,
 266        ['\''] = IN_SQ_STRING,
 267        ['0'] = IN_ZERO,
 268        ['1' ... '9'] = IN_NONZERO_NUMBER,
 269        ['-'] = IN_NEG_NONZERO_NUMBER,
 270        ['{'] = JSON_LCURLY,
 271        ['}'] = JSON_RCURLY,
 272        ['['] = JSON_LSQUARE,
 273        [']'] = JSON_RSQUARE,
 274        [','] = JSON_COMMA,
 275        [':'] = JSON_COLON,
 276        ['a' ... 'z'] = IN_KEYWORD,
 277        ['%'] = IN_ESCAPE,
 278        [' '] = IN_WHITESPACE,
 279        ['\t'] = IN_WHITESPACE,
 280        ['\r'] = IN_WHITESPACE,
 281        ['\n'] = IN_WHITESPACE,
 282    },
 283};
 284
 285void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
 286{
 287    lexer->emit = func;
 288    lexer->state = IN_START;
 289    lexer->token = g_string_sized_new(3);
 290    lexer->x = lexer->y = 0;
 291}
 292
 293static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
 294{
 295    int char_consumed, new_state;
 296
 297    lexer->x++;
 298    if (ch == '\n') {
 299        lexer->x = 0;
 300        lexer->y++;
 301    }
 302
 303    do {
 304        assert(lexer->state <= ARRAY_SIZE(json_lexer));
 305        new_state = json_lexer[lexer->state][(uint8_t)ch];
 306        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
 307        if (char_consumed) {
 308            g_string_append_c(lexer->token, ch);
 309        }
 310
 311        switch (new_state) {
 312        case JSON_LCURLY:
 313        case JSON_RCURLY:
 314        case JSON_LSQUARE:
 315        case JSON_RSQUARE:
 316        case JSON_COLON:
 317        case JSON_COMMA:
 318        case JSON_ESCAPE:
 319        case JSON_INTEGER:
 320        case JSON_FLOAT:
 321        case JSON_KEYWORD:
 322        case JSON_STRING:
 323            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
 324            /* fall through */
 325        case JSON_SKIP:
 326            g_string_truncate(lexer->token, 0);
 327            new_state = IN_START;
 328            break;
 329        case IN_ERROR:
 330            /* XXX: To avoid having previous bad input leaving the parser in an
 331             * unresponsive state where we consume unpredictable amounts of
 332             * subsequent "good" input, percolate this error state up to the
 333             * tokenizer/parser by forcing a NULL object to be emitted, then
 334             * reset state.
 335             *
 336             * Also note that this handling is required for reliable channel
 337             * negotiation between QMP and the guest agent, since chr(0xFF)
 338             * is placed at the beginning of certain events to ensure proper
 339             * delivery when the channel is in an unknown state. chr(0xFF) is
 340             * never a valid ASCII/UTF-8 sequence, so this should reliably
 341             * induce an error/flush state.
 342             */
 343            lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
 344            g_string_truncate(lexer->token, 0);
 345            new_state = IN_START;
 346            lexer->state = new_state;
 347            return 0;
 348        default:
 349            break;
 350        }
 351        lexer->state = new_state;
 352    } while (!char_consumed && !flush);
 353
 354    /* Do not let a single token grow to an arbitrarily large size,
 355     * this is a security consideration.
 356     */
 357    if (lexer->token->len > MAX_TOKEN_SIZE) {
 358        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
 359        g_string_truncate(lexer->token, 0);
 360        lexer->state = IN_START;
 361    }
 362
 363    return 0;
 364}
 365
 366int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
 367{
 368    size_t i;
 369
 370    for (i = 0; i < size; i++) {
 371        int err;
 372
 373        err = json_lexer_feed_char(lexer, buffer[i], false);
 374        if (err < 0) {
 375            return err;
 376        }
 377    }
 378
 379    return 0;
 380}
 381
 382int json_lexer_flush(JSONLexer *lexer)
 383{
 384    return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
 385}
 386
 387void json_lexer_destroy(JSONLexer *lexer)
 388{
 389    g_string_free(lexer->token, true);
 390}
 391