qemu/qobject/json-lexer.c
<<
>>
Prefs
   1/*
   2 * JSON lexer
   3 *
   4 * Copyright IBM, Corp. 2009
   5 *
   6 * Authors:
   7 *  Anthony Liguori   <aliguori@us.ibm.com>
   8 *
   9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
  10 * See the COPYING.LIB file in the top-level directory.
  11 *
  12 */
  13
  14#include "qemu/osdep.h"
  15#include "qemu-common.h"
  16#include "qapi/qmp/json-lexer.h"
  17
  18#define MAX_TOKEN_SIZE (64ULL << 20)
  19
/*
 * Required by JSON (RFC 7159):
 *
 * \"([^\\\"]|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*\"
 * -?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?
 * [{}\[\],:]
 * [a-z]+   # covers null, true, false
 *
 * Extension for single-quoted ('') strings:
 *
 * '([^\\']|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*'
 *
 * Extension for vararg handling in JSON construction:
 *
 * %((l|ll|I64)?d|[ipsf])
 *
 */
  37
  38enum json_lexer_state {
  39    IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
  40    IN_DQ_UCODE3,
  41    IN_DQ_UCODE2,
  42    IN_DQ_UCODE1,
  43    IN_DQ_UCODE0,
  44    IN_DQ_STRING_ESCAPE,
  45    IN_DQ_STRING,
  46    IN_SQ_UCODE3,
  47    IN_SQ_UCODE2,
  48    IN_SQ_UCODE1,
  49    IN_SQ_UCODE0,
  50    IN_SQ_STRING_ESCAPE,
  51    IN_SQ_STRING,
  52    IN_ZERO,
  53    IN_DIGITS,
  54    IN_DIGIT,
  55    IN_EXP_E,
  56    IN_MANTISSA,
  57    IN_MANTISSA_DIGITS,
  58    IN_NONZERO_NUMBER,
  59    IN_NEG_NONZERO_NUMBER,
  60    IN_KEYWORD,
  61    IN_ESCAPE,
  62    IN_ESCAPE_L,
  63    IN_ESCAPE_LL,
  64    IN_ESCAPE_I,
  65    IN_ESCAPE_I6,
  66    IN_ESCAPE_I64,
  67    IN_WHITESPACE,
  68    IN_START,
  69};
  70
  71QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START);
  72
  73#define TERMINAL(state) [0 ... 0x7F] = (state)
  74
  75/* Return whether TERMINAL is a terminal state and the transition to it
  76   from OLD_STATE required lookahead.  This happens whenever the table
  77   below uses the TERMINAL macro.  */
  78#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
  79            (json_lexer[(old_state)][0] == (terminal))
  80
  81static const uint8_t json_lexer[][256] =  {
  82    /* Relies on default initialization to IN_ERROR! */
  83
  84    /* double quote string */
  85    [IN_DQ_UCODE3] = {
  86        ['0' ... '9'] = IN_DQ_STRING,
  87        ['a' ... 'f'] = IN_DQ_STRING,
  88        ['A' ... 'F'] = IN_DQ_STRING,
  89    },
  90    [IN_DQ_UCODE2] = {
  91        ['0' ... '9'] = IN_DQ_UCODE3,
  92        ['a' ... 'f'] = IN_DQ_UCODE3,
  93        ['A' ... 'F'] = IN_DQ_UCODE3,
  94    },
  95    [IN_DQ_UCODE1] = {
  96        ['0' ... '9'] = IN_DQ_UCODE2,
  97        ['a' ... 'f'] = IN_DQ_UCODE2,
  98        ['A' ... 'F'] = IN_DQ_UCODE2,
  99    },
 100    [IN_DQ_UCODE0] = {
 101        ['0' ... '9'] = IN_DQ_UCODE1,
 102        ['a' ... 'f'] = IN_DQ_UCODE1,
 103        ['A' ... 'F'] = IN_DQ_UCODE1,
 104    },
 105    [IN_DQ_STRING_ESCAPE] = {
 106        ['b'] = IN_DQ_STRING,
 107        ['f'] =  IN_DQ_STRING,
 108        ['n'] =  IN_DQ_STRING,
 109        ['r'] =  IN_DQ_STRING,
 110        ['t'] =  IN_DQ_STRING,
 111        ['/'] = IN_DQ_STRING,
 112        ['\\'] = IN_DQ_STRING,
 113        ['\''] = IN_DQ_STRING,
 114        ['\"'] = IN_DQ_STRING,
 115        ['u'] = IN_DQ_UCODE0,
 116    },
 117    [IN_DQ_STRING] = {
 118        [1 ... 0xBF] = IN_DQ_STRING,
 119        [0xC2 ... 0xF4] = IN_DQ_STRING,
 120        ['\\'] = IN_DQ_STRING_ESCAPE,
 121        ['"'] = JSON_STRING,
 122    },
 123
 124    /* single quote string */
 125    [IN_SQ_UCODE3] = {
 126        ['0' ... '9'] = IN_SQ_STRING,
 127        ['a' ... 'f'] = IN_SQ_STRING,
 128        ['A' ... 'F'] = IN_SQ_STRING,
 129    },
 130    [IN_SQ_UCODE2] = {
 131        ['0' ... '9'] = IN_SQ_UCODE3,
 132        ['a' ... 'f'] = IN_SQ_UCODE3,
 133        ['A' ... 'F'] = IN_SQ_UCODE3,
 134    },
 135    [IN_SQ_UCODE1] = {
 136        ['0' ... '9'] = IN_SQ_UCODE2,
 137        ['a' ... 'f'] = IN_SQ_UCODE2,
 138        ['A' ... 'F'] = IN_SQ_UCODE2,
 139    },
 140    [IN_SQ_UCODE0] = {
 141        ['0' ... '9'] = IN_SQ_UCODE1,
 142        ['a' ... 'f'] = IN_SQ_UCODE1,
 143        ['A' ... 'F'] = IN_SQ_UCODE1,
 144    },
 145    [IN_SQ_STRING_ESCAPE] = {
 146        ['b'] = IN_SQ_STRING,
 147        ['f'] =  IN_SQ_STRING,
 148        ['n'] =  IN_SQ_STRING,
 149        ['r'] =  IN_SQ_STRING,
 150        ['t'] =  IN_SQ_STRING,
 151        ['/'] = IN_SQ_STRING,
 152        ['\\'] = IN_SQ_STRING,
 153        ['\''] = IN_SQ_STRING,
 154        ['\"'] = IN_SQ_STRING,
 155        ['u'] = IN_SQ_UCODE0,
 156    },
 157    [IN_SQ_STRING] = {
 158        [1 ... 0xBF] = IN_SQ_STRING,
 159        [0xC2 ... 0xF4] = IN_SQ_STRING,
 160        ['\\'] = IN_SQ_STRING_ESCAPE,
 161        ['\''] = JSON_STRING,
 162    },
 163
 164    /* Zero */
 165    [IN_ZERO] = {
 166        TERMINAL(JSON_INTEGER),
 167        ['0' ... '9'] = IN_ERROR,
 168        ['.'] = IN_MANTISSA,
 169    },
 170
 171    /* Float */
 172    [IN_DIGITS] = {
 173        TERMINAL(JSON_FLOAT),
 174        ['0' ... '9'] = IN_DIGITS,
 175    },
 176
 177    [IN_DIGIT] = {
 178        ['0' ... '9'] = IN_DIGITS,
 179    },
 180
 181    [IN_EXP_E] = {
 182        ['-'] = IN_DIGIT,
 183        ['+'] = IN_DIGIT,
 184        ['0' ... '9'] = IN_DIGITS,
 185    },
 186
 187    [IN_MANTISSA_DIGITS] = {
 188        TERMINAL(JSON_FLOAT),
 189        ['0' ... '9'] = IN_MANTISSA_DIGITS,
 190        ['e'] = IN_EXP_E,
 191        ['E'] = IN_EXP_E,
 192    },
 193
 194    [IN_MANTISSA] = {
 195        ['0' ... '9'] = IN_MANTISSA_DIGITS,
 196    },
 197
 198    /* Number */
 199    [IN_NONZERO_NUMBER] = {
 200        TERMINAL(JSON_INTEGER),
 201        ['0' ... '9'] = IN_NONZERO_NUMBER,
 202        ['e'] = IN_EXP_E,
 203        ['E'] = IN_EXP_E,
 204        ['.'] = IN_MANTISSA,
 205    },
 206
 207    [IN_NEG_NONZERO_NUMBER] = {
 208        ['0'] = IN_ZERO,
 209        ['1' ... '9'] = IN_NONZERO_NUMBER,
 210    },
 211
 212    /* keywords */
 213    [IN_KEYWORD] = {
 214        TERMINAL(JSON_KEYWORD),
 215        ['a' ... 'z'] = IN_KEYWORD,
 216    },
 217
 218    /* whitespace */
 219    [IN_WHITESPACE] = {
 220        TERMINAL(JSON_SKIP),
 221        [' '] = IN_WHITESPACE,
 222        ['\t'] = IN_WHITESPACE,
 223        ['\r'] = IN_WHITESPACE,
 224        ['\n'] = IN_WHITESPACE,
 225    },
 226
 227    /* escape */
 228    [IN_ESCAPE_LL] = {
 229        ['d'] = JSON_ESCAPE,
 230    },
 231
 232    [IN_ESCAPE_L] = {
 233        ['d'] = JSON_ESCAPE,
 234        ['l'] = IN_ESCAPE_LL,
 235    },
 236
 237    [IN_ESCAPE_I64] = {
 238        ['d'] = JSON_ESCAPE,
 239    },
 240
 241    [IN_ESCAPE_I6] = {
 242        ['4'] = IN_ESCAPE_I64,
 243    },
 244
 245    [IN_ESCAPE_I] = {
 246        ['6'] = IN_ESCAPE_I6,
 247    },
 248
 249    [IN_ESCAPE] = {
 250        ['d'] = JSON_ESCAPE,
 251        ['i'] = JSON_ESCAPE,
 252        ['p'] = JSON_ESCAPE,
 253        ['s'] = JSON_ESCAPE,
 254        ['f'] = JSON_ESCAPE,
 255        ['l'] = IN_ESCAPE_L,
 256        ['I'] = IN_ESCAPE_I,
 257    },
 258
 259    /* top level rule */
 260    [IN_START] = {
 261        ['"'] = IN_DQ_STRING,
 262        ['\''] = IN_SQ_STRING,
 263        ['0'] = IN_ZERO,
 264        ['1' ... '9'] = IN_NONZERO_NUMBER,
 265        ['-'] = IN_NEG_NONZERO_NUMBER,
 266        ['{'] = JSON_LCURLY,
 267        ['}'] = JSON_RCURLY,
 268        ['['] = JSON_LSQUARE,
 269        [']'] = JSON_RSQUARE,
 270        [','] = JSON_COMMA,
 271        [':'] = JSON_COLON,
 272        ['a' ... 'z'] = IN_KEYWORD,
 273        ['%'] = IN_ESCAPE,
 274        [' '] = IN_WHITESPACE,
 275        ['\t'] = IN_WHITESPACE,
 276        ['\r'] = IN_WHITESPACE,
 277        ['\n'] = IN_WHITESPACE,
 278    },
 279};
 280
 281void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
 282{
 283    lexer->emit = func;
 284    lexer->state = IN_START;
 285    lexer->token = g_string_sized_new(3);
 286    lexer->x = lexer->y = 0;
 287}
 288
 289static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
 290{
 291    int char_consumed, new_state;
 292
 293    lexer->x++;
 294    if (ch == '\n') {
 295        lexer->x = 0;
 296        lexer->y++;
 297    }
 298
 299    do {
 300        assert(lexer->state <= ARRAY_SIZE(json_lexer));
 301        new_state = json_lexer[lexer->state][(uint8_t)ch];
 302        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
 303        if (char_consumed) {
 304            g_string_append_c(lexer->token, ch);
 305        }
 306
 307        switch (new_state) {
 308        case JSON_LCURLY:
 309        case JSON_RCURLY:
 310        case JSON_LSQUARE:
 311        case JSON_RSQUARE:
 312        case JSON_COLON:
 313        case JSON_COMMA:
 314        case JSON_ESCAPE:
 315        case JSON_INTEGER:
 316        case JSON_FLOAT:
 317        case JSON_KEYWORD:
 318        case JSON_STRING:
 319            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
 320            /* fall through */
 321        case JSON_SKIP:
 322            g_string_truncate(lexer->token, 0);
 323            new_state = IN_START;
 324            break;
 325        case IN_ERROR:
 326            /* XXX: To avoid having previous bad input leaving the parser in an
 327             * unresponsive state where we consume unpredictable amounts of
 328             * subsequent "good" input, percolate this error state up to the
 329             * tokenizer/parser by forcing a NULL object to be emitted, then
 330             * reset state.
 331             *
 332             * Also note that this handling is required for reliable channel
 333             * negotiation between QMP and the guest agent, since chr(0xFF)
 334             * is placed at the beginning of certain events to ensure proper
 335             * delivery when the channel is in an unknown state. chr(0xFF) is
 336             * never a valid ASCII/UTF-8 sequence, so this should reliably
 337             * induce an error/flush state.
 338             */
 339            lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
 340            g_string_truncate(lexer->token, 0);
 341            new_state = IN_START;
 342            lexer->state = new_state;
 343            return 0;
 344        default:
 345            break;
 346        }
 347        lexer->state = new_state;
 348    } while (!char_consumed && !flush);
 349
 350    /* Do not let a single token grow to an arbitrarily large size,
 351     * this is a security consideration.
 352     */
 353    if (lexer->token->len > MAX_TOKEN_SIZE) {
 354        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
 355        g_string_truncate(lexer->token, 0);
 356        lexer->state = IN_START;
 357    }
 358
 359    return 0;
 360}
 361
 362int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
 363{
 364    size_t i;
 365
 366    for (i = 0; i < size; i++) {
 367        int err;
 368
 369        err = json_lexer_feed_char(lexer, buffer[i], false);
 370        if (err < 0) {
 371            return err;
 372        }
 373    }
 374
 375    return 0;
 376}
 377
 378int json_lexer_flush(JSONLexer *lexer)
 379{
 380    return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
 381}
 382
 383void json_lexer_destroy(JSONLexer *lexer)
 384{
 385    g_string_free(lexer->token, true);
 386}
 387