qemu/hw/i386/kvm/xenstore_impl.c
<<
>>
Prefs
   1/*
   2 * QEMU Xen emulation: The actual implementation of XenStore
   3 *
   4 * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
   5 *
   6 * Authors: David Woodhouse <dwmw2@infradead.org>, Paul Durrant <paul@xen.org>
   7 *
   8 * This work is licensed under the terms of the GNU GPL, version 2 or later.
   9 * See the COPYING file in the top-level directory.
  10 */
  11
  12#include "qemu/osdep.h"
  13#include "qom/object.h"
  14
  15#include "hw/xen/xen.h"
  16
  17#include "xen_xenstore.h"
  18#include "xenstore_impl.h"
  19
  20#include "hw/xen/interface/io/xs_wire.h"
  21
  22#define XS_MAX_WATCHES          128
  23#define XS_MAX_DOMAIN_NODES     1000
  24#define XS_MAX_NODE_SIZE        2048
  25#define XS_MAX_TRANSACTIONS     10
  26#define XS_MAX_PERMS_PER_NODE   5
  27
  28#define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \
  29                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
  30                       "0123456789-/_"
  31
  32typedef struct XsNode {
  33    uint32_t ref;
  34    GByteArray *content;
  35    GList *perms;
  36    GHashTable *children;
  37    uint64_t gencnt;
  38    bool deleted_in_tx;
  39    bool modified_in_tx;
  40    unsigned int serialized_tx;
  41#ifdef XS_NODE_UNIT_TEST
  42    gchar *name; /* debug only */
  43#endif
  44} XsNode;
  45
  46typedef struct XsWatch {
  47    struct XsWatch *next;
  48    xs_impl_watch_fn *cb;
  49    void *cb_opaque;
  50    char *token;
  51    unsigned int dom_id;
  52    int rel_prefix;
  53} XsWatch;
  54
  55typedef struct XsTransaction {
  56    XsNode *root;
  57    unsigned int nr_nodes;
  58    unsigned int base_tx;
  59    unsigned int tx_id;
  60    unsigned int dom_id;
  61} XsTransaction;
  62
  63struct XenstoreImplState {
  64    XsNode *root;
  65    unsigned int nr_nodes;
  66    GHashTable *watches;
  67    unsigned int nr_domu_watches;
  68    GHashTable *transactions;
  69    unsigned int nr_domu_transactions;
  70    unsigned int root_tx;
  71    unsigned int last_tx;
  72    bool serialized;
  73};
  74
  75
  76static void nobble_tx(gpointer key, gpointer value, gpointer user_data)
  77{
  78    unsigned int *new_tx_id = user_data;
  79    XsTransaction *tx = value;
  80
  81    if (tx->base_tx == *new_tx_id) {
  82        /* Transactions based on XBT_NULL will always fail */
  83        tx->base_tx = XBT_NULL;
  84    }
  85}
  86
  87static inline unsigned int next_tx(struct XenstoreImplState *s)
  88{
  89    unsigned int tx_id;
  90
  91    /* Find the next TX id which isn't either XBT_NULL or in use. */
  92    do {
  93        tx_id = ++s->last_tx;
  94    } while (tx_id == XBT_NULL || tx_id == s->root_tx ||
  95             g_hash_table_lookup(s->transactions, GINT_TO_POINTER(tx_id)));
  96
  97    /*
  98     * It is vanishingly unlikely, but ensure that no outstanding transaction
  99     * is based on the (previous incarnation of the) newly-allocated TX id.
 100     */
 101    g_hash_table_foreach(s->transactions, nobble_tx, &tx_id);
 102
 103    return tx_id;
 104}
 105
 106static inline XsNode *xs_node_new(void)
 107{
 108    XsNode *n = g_new0(XsNode, 1);
 109    n->ref = 1;
 110
 111#ifdef XS_NODE_UNIT_TEST
 112    nr_xs_nodes++;
 113    xs_node_list = g_list_prepend(xs_node_list, n);
 114#endif
 115    return n;
 116}
 117
 118static inline XsNode *xs_node_ref(XsNode *n)
 119{
 120    /* With just 10 transactions, it can never get anywhere near this. */
 121    g_assert(n->ref < INT_MAX);
 122
 123    g_assert(n->ref);
 124    n->ref++;
 125    return n;
 126}
 127
 128static inline void xs_node_unref(XsNode *n)
 129{
 130    if (!n) {
 131        return;
 132    }
 133    g_assert(n->ref);
 134    if (--n->ref) {
 135        return;
 136    }
 137
 138    if (n->content) {
 139        g_byte_array_unref(n->content);
 140    }
 141    if (n->perms) {
 142        g_list_free_full(n->perms, g_free);
 143    }
 144    if (n->children) {
 145        g_hash_table_unref(n->children);
 146    }
 147#ifdef XS_NODE_UNIT_TEST
 148    g_free(n->name);
 149    nr_xs_nodes--;
 150    xs_node_list = g_list_remove(xs_node_list, n);
 151#endif
 152    g_free(n);
 153}
 154
 155char *xs_perm_as_string(unsigned int perm, unsigned int domid)
 156{
 157    char letter;
 158
 159    switch (perm) {
 160    case XS_PERM_READ | XS_PERM_WRITE:
 161        letter = 'b';
 162        break;
 163    case XS_PERM_READ:
 164        letter = 'r';
 165        break;
 166    case XS_PERM_WRITE:
 167        letter = 'w';
 168        break;
 169    case XS_PERM_NONE:
 170    default:
 171        letter = 'n';
 172        break;
 173    }
 174
 175    return g_strdup_printf("%c%u", letter, domid);
 176}
 177
 178static gpointer do_perm_copy(gconstpointer src, gpointer user_data)
 179{
 180    return g_strdup(src);
 181}
 182
 183static XsNode *xs_node_create(const char *name, GList *perms)
 184{
 185    XsNode *n = xs_node_new();
 186
 187#ifdef XS_NODE_UNIT_TEST
 188    if (name) {
 189        n->name = g_strdup(name);
 190    }
 191#endif
 192
 193    n->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
 194
 195    return n;
 196}
 197
 198/* For copying from one hash table to another using g_hash_table_foreach() */
 199static void do_child_insert(gpointer key, gpointer value, gpointer user_data)
 200{
 201    g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value));
 202}
 203
 204static XsNode *xs_node_copy(XsNode *old)
 205{
 206    XsNode *n = xs_node_new();
 207
 208    n->gencnt = old->gencnt;
 209
 210#ifdef XS_NODE_UNIT_TEST
 211    if (n->name) {
 212        n->name = g_strdup(old->name);
 213    }
 214#endif
 215
 216    assert(old);
 217    if (old->children) {
 218        n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
 219                                            (GDestroyNotify)xs_node_unref);
 220        g_hash_table_foreach(old->children, do_child_insert, n->children);
 221    }
 222    if (old->perms) {
 223        n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
 224    }
 225    if (old->content) {
 226        n->content = g_byte_array_ref(old->content);
 227    }
 228    return n;
 229}
 230
 231/* Returns true if it made a change to the hash table */
 232static bool xs_node_add_child(XsNode *n, const char *path_elem, XsNode *child)
 233{
 234    assert(!strchr(path_elem, '/'));
 235
 236    if (!child) {
 237        assert(n->children);
 238        return g_hash_table_remove(n->children, path_elem);
 239    }
 240
 241#ifdef XS_NODE_UNIT_TEST
 242    g_free(child->name);
 243    child->name = g_strdup(path_elem);
 244#endif
 245    if (!n->children) {
 246        n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
 247                                            (GDestroyNotify)xs_node_unref);
 248    }
 249
 250    /*
 251     * The documentation for g_hash_table_insert() says that it "returns a
 252     * boolean value to indicate whether the newly added value was already
 253     * in the hash table or not."
 254     *
 255     * It could perhaps be clearer that returning TRUE means it wasn't,
 256     */
 257    return g_hash_table_insert(n->children, g_strdup(path_elem), child);
 258}
 259
 260struct walk_op {
 261    struct XenstoreImplState *s;
 262    char path[XENSTORE_ABS_PATH_MAX + 2]; /* Two NUL terminators */
 263    int (*op_fn)(XsNode **n, struct walk_op *op);
 264    void *op_opaque;
 265    void *op_opaque2;
 266
 267    GList *watches;
 268    unsigned int dom_id;
 269    unsigned int tx_id;
 270
 271    /* The number of nodes which will exist in the tree if this op succeeds. */
 272    unsigned int new_nr_nodes;
 273
 274    /*
 275     * This is maintained on the way *down* the walk to indicate
 276     * whether nodes can be modified in place or whether COW is
 277     * required. It starts off being true, as we're always going to
 278     * replace the root node. If we walk into a shared subtree it
 279     * becomes false. If we start *creating* new nodes for a write,
 280     * it becomes true again.
 281     *
 282     * Do not use it on the way back up.
 283     */
 284    bool inplace;
 285    bool mutating;
 286    bool create_dirs;
 287    bool in_transaction;
 288
 289    /* Tracking during recursion so we know which is first. */
 290    bool deleted_in_tx;
 291};
 292
 293static void fire_watches(struct walk_op *op, bool parents)
 294{
 295    GList *l = NULL;
 296    XsWatch *w;
 297
 298    if (!op->mutating || op->in_transaction) {
 299        return;
 300    }
 301
 302    if (parents) {
 303        l = op->watches;
 304    }
 305
 306    w = g_hash_table_lookup(op->s->watches, op->path);
 307    while (w || l) {
 308        if (!w) {
 309            /* Fire the parent nodes from 'op' if asked to */
 310            w = l->data;
 311            l = l->next;
 312            continue;
 313        }
 314
 315        assert(strlen(op->path) > w->rel_prefix);
 316        w->cb(w->cb_opaque, op->path + w->rel_prefix, w->token);
 317
 318        w = w->next;
 319    }
 320}
 321
 322static int xs_node_add_content(XsNode **n, struct walk_op *op)
 323{
 324    GByteArray *data = op->op_opaque;
 325
 326    if (op->dom_id) {
 327        /*
 328         * The real XenStored includes permissions and names of child nodes
 329         * in the calculated datasize but life's too short. For a single
 330         * tenant internal XenStore, we don't have to be quite as pedantic.
 331         */
 332        if (data->len > XS_MAX_NODE_SIZE) {
 333            return E2BIG;
 334        }
 335    }
 336    /* We *are* the node to be written. Either this or a copy. */
 337    if (!op->inplace) {
 338        XsNode *old = *n;
 339        *n = xs_node_copy(old);
 340        xs_node_unref(old);
 341    }
 342
 343    if ((*n)->content) {
 344        g_byte_array_unref((*n)->content);
 345    }
 346    (*n)->content = g_byte_array_ref(data);
 347    if (op->tx_id != XBT_NULL) {
 348        (*n)->modified_in_tx = true;
 349    }
 350    return 0;
 351}
 352
 353static int xs_node_get_content(XsNode **n, struct walk_op *op)
 354{
 355    GByteArray *data = op->op_opaque;
 356    GByteArray *node_data;
 357
 358    assert(op->inplace);
 359    assert(*n);
 360
 361    node_data = (*n)->content;
 362    if (node_data) {
 363        g_byte_array_append(data, node_data->data, node_data->len);
 364    }
 365
 366    return 0;
 367}
 368
 369static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data)
 370{
 371    struct walk_op *op = user_data;
 372    int path_len = strlen(op->path);
 373    int key_len = strlen(key);
 374    XsNode *n = value;
 375    bool this_inplace = op->inplace;
 376
 377    if (n->ref != 1) {
 378        op->inplace = 0;
 379    }
 380
 381    assert(key_len + path_len + 2 <= sizeof(op->path));
 382    op->path[path_len] = '/';
 383    memcpy(op->path + path_len + 1, key, key_len + 1);
 384
 385    if (n->children) {
 386        g_hash_table_foreach_remove(n->children, node_rm_recurse, op);
 387    }
 388    op->new_nr_nodes--;
 389
 390    /*
 391     * Fire watches on *this* node but not the parents because they are
 392     * going to be deleted too, so the watch will fire for them anyway.
 393     */
 394    fire_watches(op, false);
 395    op->path[path_len] = '\0';
 396
 397    /*
 398     * Actually deleting the child here is just an optimisation; if we
 399     * don't then the final unref on the topmost victim will just have
 400     * to cascade down again repeating all the g_hash_table_foreach()
 401     * calls.
 402     */
 403    return this_inplace;
 404}
 405
 406static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op);
 407static void copy_deleted_recurse(gpointer key, gpointer value,
 408                                 gpointer user_data)
 409{
 410    struct walk_op *op = user_data;
 411    GHashTable *siblings = op->op_opaque2;
 412    XsNode *n = xs_node_copy_deleted(value, op);
 413
 414    /*
 415     * Reinsert the deleted_in_tx copy of the node into the parent's
 416     * 'children' hash table. Having stashed it from op->op_opaque2
 417     * before the recursive call to xs_node_copy_deleted() scribbled
 418     * over it.
 419     */
 420    g_hash_table_insert(siblings, g_strdup(key), n);
 421}
 422
 423static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op)
 424{
 425    XsNode *n = xs_node_new();
 426
 427    n->gencnt = old->gencnt;
 428
 429#ifdef XS_NODE_UNIT_TEST
 430    if (old->name) {
 431        n->name = g_strdup(old->name);
 432    }
 433#endif
 434
 435    if (old->children) {
 436        n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
 437                                            (GDestroyNotify)xs_node_unref);
 438        op->op_opaque2 = n->children;
 439        g_hash_table_foreach(old->children, copy_deleted_recurse, op);
 440    }
 441    if (old->perms) {
 442        n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
 443    }
 444    n->deleted_in_tx = true;
 445    /* If it gets resurrected we only fire a watch if it lost its content */
 446    if (old->content) {
 447        n->modified_in_tx = true;
 448    }
 449    op->new_nr_nodes--;
 450    return n;
 451}
 452
 453static int xs_node_rm(XsNode **n, struct walk_op *op)
 454{
 455    bool this_inplace = op->inplace;
 456
 457    if (op->tx_id != XBT_NULL) {
 458        /* It's not trivial to do inplace handling for this one */
 459        XsNode *old = *n;
 460        *n = xs_node_copy_deleted(old, op);
 461        xs_node_unref(old);
 462        return 0;
 463    }
 464
 465    /* Fire watches for, and count, nodes in the subtree which get deleted */
 466    if ((*n)->children) {
 467        g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op);
 468    }
 469    op->new_nr_nodes--;
 470
 471    if (this_inplace) {
 472        xs_node_unref(*n);
 473    }
 474    *n = NULL;
 475    return 0;
 476}
 477
 478static int xs_node_get_perms(XsNode **n, struct walk_op *op)
 479{
 480    GList **perms = op->op_opaque;
 481
 482    assert(op->inplace);
 483    assert(*n);
 484
 485    *perms = g_list_copy_deep((*n)->perms, do_perm_copy, NULL);
 486    return 0;
 487}
 488
 489static void parse_perm(const char *perm, char *letter, unsigned int *dom_id)
 490{
 491    unsigned int n = sscanf(perm, "%c%u", letter, dom_id);
 492
 493    assert(n == 2);
 494}
 495
 496static bool can_access(unsigned int dom_id, GList *perms, const char *letters)
 497{
 498    unsigned int i, n;
 499    char perm_letter;
 500    unsigned int perm_dom_id;
 501    bool access;
 502
 503    if (dom_id == 0) {
 504        return true;
 505    }
 506
 507    n = g_list_length(perms);
 508    assert(n >= 1);
 509
 510    /*
 511     * The dom_id of the first perm is the owner, and the owner always has
 512     * read-write access.
 513     */
 514    parse_perm(g_list_nth_data(perms, 0), &perm_letter, &perm_dom_id);
 515    if (dom_id == perm_dom_id) {
 516        return true;
 517    }
 518
 519    /*
 520     * The letter of the first perm specified the default access for all other
 521     * domains.
 522     */
 523    access = !!strchr(letters, perm_letter);
 524    for (i = 1; i < n; i++) {
 525        parse_perm(g_list_nth_data(perms, i), &perm_letter, &perm_dom_id);
 526        if (dom_id != perm_dom_id) {
 527            continue;
 528        }
 529        access = !!strchr(letters, perm_letter);
 530    }
 531
 532    return access;
 533}
 534
 535static int xs_node_set_perms(XsNode **n, struct walk_op *op)
 536{
 537    GList *perms = op->op_opaque;
 538
 539    if (op->dom_id) {
 540        unsigned int perm_dom_id;
 541        char perm_letter;
 542
 543        /* A guest may not change permissions on nodes it does not own */
 544        if (!can_access(op->dom_id, (*n)->perms, "")) {
 545            return EPERM;
 546        }
 547
 548        /* A guest may not change the owner of a node it owns. */
 549        parse_perm(perms->data, &perm_letter, &perm_dom_id);
 550        if (perm_dom_id != op->dom_id) {
 551            return EPERM;
 552        }
 553
 554        if (g_list_length(perms) > XS_MAX_PERMS_PER_NODE) {
 555            return ENOSPC;
 556        }
 557    }
 558
 559    /* We *are* the node to be written. Either this or a copy. */
 560    if (!op->inplace) {
 561        XsNode *old = *n;
 562        *n = xs_node_copy(old);
 563        xs_node_unref(old);
 564    }
 565
 566    if ((*n)->perms) {
 567        g_list_free_full((*n)->perms, g_free);
 568    }
 569    (*n)->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
 570    if (op->tx_id != XBT_NULL) {
 571        (*n)->modified_in_tx = true;
 572    }
 573    return 0;
 574}
 575
 576/*
 577 * Passed a full reference in *n which it may free if it needs to COW.
 578 *
 579 * When changing the tree, the op->inplace flag indicates whether this
 580 * node may be modified in place (i.e. it and all its parents had a
 581 * refcount of one). If walking down the tree we find a node whose
 582 * refcount is higher, we must clear op->inplace and COW from there
 583 * down. Unless we are creating new nodes as scaffolding for a write
 584 * (which works like 'mkdir -p' does). In which case those newly
 585 * created nodes can (and must) be modified in place again.
 586 */
 587static int xs_node_walk(XsNode **n, struct walk_op *op)
 588{
 589    char *child_name = NULL;
 590    size_t namelen;
 591    XsNode *old = *n, *child = NULL;
 592    bool stole_child = false;
 593    bool this_inplace;
 594    XsWatch *watch;
 595    int err;
 596
 597    namelen = strlen(op->path);
 598    watch = g_hash_table_lookup(op->s->watches, op->path);
 599
 600    /* Is there a child, or do we hit the double-NUL termination? */
 601    if (op->path[namelen + 1]) {
 602        char *slash;
 603        child_name = op->path + namelen + 1;
 604        slash = strchr(child_name, '/');
 605        if (slash) {
 606            *slash = '\0';
 607        }
 608        op->path[namelen] = '/';
 609    }
 610
 611    /* If we walk into a subtree which is shared, we must COW */
 612    if (op->mutating && old->ref != 1) {
 613        op->inplace = false;
 614    }
 615
 616    if (!child_name) {
 617        const char *letters = op->mutating ? "wb" : "rb";
 618
 619        if (!can_access(op->dom_id, old->perms, letters)) {
 620            err = EACCES;
 621            goto out;
 622        }
 623
 624        /* This is the actual node on which the operation shall be performed */
 625        err = op->op_fn(n, op);
 626        if (!err) {
 627            fire_watches(op, true);
 628        }
 629        goto out;
 630    }
 631
 632    /* op->inplace will be further modified during the recursion */
 633    this_inplace = op->inplace;
 634
 635    if (old && old->children) {
 636        child = g_hash_table_lookup(old->children, child_name);
 637        /* This is a *weak* reference to 'child', owned by the hash table */
 638    }
 639
 640    if (child) {
 641        if (child->deleted_in_tx) {
 642            assert(child->ref == 1);
 643            /* Cannot actually set child->deleted_in_tx = false until later */
 644        }
 645        xs_node_ref(child);
 646        /*
 647         * Now we own it too. But if we can modify inplace, that's going to
 648         * foil the check and force it to COW. We want to be the *only* owner
 649         * so that it can be modified in place, so remove it from the hash
 650         * table in that case. We'll add it (or its replacement) back later.
 651         */
 652        if (op->mutating && this_inplace) {
 653            g_hash_table_remove(old->children, child_name);
 654            stole_child = true;
 655        }
 656    } else if (op->create_dirs) {
 657        assert(op->mutating);
 658
 659        if (!can_access(op->dom_id, old->perms, "wb")) {
 660            err = EACCES;
 661            goto out;
 662        }
 663
 664        if (op->dom_id && op->new_nr_nodes >= XS_MAX_DOMAIN_NODES) {
 665            err = ENOSPC;
 666            goto out;
 667        }
 668
 669        child = xs_node_create(child_name, old->perms);
 670        op->new_nr_nodes++;
 671
 672        /*
 673         * If we're creating a new child, we can clearly modify it (and its
 674         * children) in place from here on down.
 675         */
 676        op->inplace = true;
 677    } else {
 678        err = ENOENT;
 679        goto out;
 680    }
 681
 682    /*
 683     * If there's a watch on this node, add it to the list to be fired
 684     * (with the correct full pathname for the modified node) at the end.
 685     */
 686    if (watch) {
 687        op->watches = g_list_append(op->watches, watch);
 688    }
 689
 690    /*
 691     * Except for the temporary child-stealing as noted, our node has not
 692     * changed yet. We don't yet know the overall operation will complete.
 693     */
 694    err = xs_node_walk(&child, op);
 695
 696    if (watch) {
 697        op->watches = g_list_remove(op->watches, watch);
 698    }
 699
 700    if (err || !op->mutating) {
 701        if (stole_child) {
 702            /* Put it back as it was. */
 703            g_hash_table_replace(old->children, g_strdup(child_name), child);
 704        } else {
 705            xs_node_unref(child);
 706        }
 707        goto out;
 708    }
 709
 710    /*
 711     * Now we know the operation has completed successfully and we're on
 712     * the way back up. Make the change, substituting 'child' in the
 713     * node at our level.
 714     */
 715    if (!this_inplace) {
 716        *n = xs_node_copy(old);
 717        xs_node_unref(old);
 718    }
 719
 720    /*
 721     * If we resurrected a deleted_in_tx node, we can mark it as no longer
 722     * deleted now that we know the overall operation has succeeded.
 723     */
 724    if (op->create_dirs && child && child->deleted_in_tx) {
 725        op->new_nr_nodes++;
 726        child->deleted_in_tx = false;
 727    }
 728
 729    /*
 730     * The child may be NULL here, for a remove operation. Either way,
 731     * xs_node_add_child() will do the right thing and return a value
 732     * indicating whether it changed the parent's hash table or not.
 733     *
 734     * We bump the parent gencnt if it adds a child that we *didn't*
 735     * steal from it in the first place, or if child==NULL and was
 736     * thus removed (whether we stole it earlier and didn't put it
 737     * back, or xs_node_add_child() actually removed it now).
 738     */
 739    if ((xs_node_add_child(*n, child_name, child) && !stole_child) || !child) {
 740        (*n)->gencnt++;
 741    }
 742
 743 out:
 744    op->path[namelen] = '\0';
 745    if (!namelen) {
 746        assert(!op->watches);
 747        /*
 748         * On completing the recursion back up the path walk and reaching the
 749         * top, assign the new node count if the operation was successful. If
 750         * the main tree was changed, bump its tx ID so that outstanding
 751         * transactions correctly fail. But don't bump it every time; only
 752         * if it makes a difference.
 753         */
 754        if (!err && op->mutating) {
 755            if (!op->in_transaction) {
 756                if (op->s->root_tx != op->s->last_tx) {
 757                    op->s->root_tx = next_tx(op->s);
 758                }
 759                op->s->nr_nodes = op->new_nr_nodes;
 760            } else {
 761                XsTransaction *tx = g_hash_table_lookup(op->s->transactions,
 762                                                        GINT_TO_POINTER(op->tx_id));
 763                assert(tx);
 764                tx->nr_nodes = op->new_nr_nodes;
 765            }
 766        }
 767    }
 768    return err;
 769}
 770
 771static void append_directory_item(gpointer key, gpointer value,
 772                                  gpointer user_data)
 773{
 774    GList **items = user_data;
 775
 776    *items = g_list_insert_sorted(*items, g_strdup(key), (GCompareFunc)strcmp);
 777}
 778
 779/* Populates items with char * names which caller must free. */
 780static int xs_node_directory(XsNode **n, struct walk_op *op)
 781{
 782    GList **items = op->op_opaque;
 783
 784    assert(op->inplace);
 785    assert(*n);
 786
 787    if ((*n)->children) {
 788        g_hash_table_foreach((*n)->children, append_directory_item, items);
 789    }
 790
 791    if (op->op_opaque2) {
 792        *(uint64_t *)op->op_opaque2 = (*n)->gencnt;
 793    }
 794
 795    return 0;
 796}
 797
 798static int validate_path(char *outpath, const char *userpath,
 799                         unsigned int dom_id)
 800{
 801    size_t i, pathlen = strlen(userpath);
 802
 803    if (!pathlen || userpath[pathlen] == '/' || strstr(userpath, "//")) {
 804        return EINVAL;
 805    }
 806    for (i = 0; i < pathlen; i++) {
 807        if (!strchr(XS_VALID_CHARS, userpath[i])) {
 808            return EINVAL;
 809        }
 810    }
 811    if (userpath[0] == '/') {
 812        if (pathlen > XENSTORE_ABS_PATH_MAX) {
 813            return E2BIG;
 814        }
 815        memcpy(outpath, userpath, pathlen + 1);
 816    } else {
 817        if (pathlen > XENSTORE_REL_PATH_MAX) {
 818            return E2BIG;
 819        }
 820        snprintf(outpath, XENSTORE_ABS_PATH_MAX, "/local/domain/%u/%s", dom_id,
 821                 userpath);
 822    }
 823    return 0;
 824}
 825
 826
 827static int init_walk_op(XenstoreImplState *s, struct walk_op *op,
 828                        xs_transaction_t tx_id, unsigned int dom_id,
 829                        const char *path, XsNode ***rootp)
 830{
 831    int ret = validate_path(op->path, path, dom_id);
 832    if (ret) {
 833        return ret;
 834    }
 835
 836    /*
 837     * We use *two* NUL terminators at the end of the path, as during the walk
 838     * we will temporarily turn each '/' into a NUL to allow us to use that
 839     * path element for the lookup.
 840     */
 841    op->path[strlen(op->path) + 1] = '\0';
 842    op->watches = NULL;
 843    op->path[0] = '\0';
 844    op->inplace = true;
 845    op->mutating = false;
 846    op->create_dirs = false;
 847    op->in_transaction = false;
 848    op->dom_id = dom_id;
 849    op->tx_id = tx_id;
 850    op->s = s;
 851
 852    if (tx_id == XBT_NULL) {
 853        *rootp = &s->root;
 854        op->new_nr_nodes = s->nr_nodes;
 855    } else {
 856        XsTransaction *tx = g_hash_table_lookup(s->transactions,
 857                                                GINT_TO_POINTER(tx_id));
 858        if (!tx) {
 859            return ENOENT;
 860        }
 861        *rootp = &tx->root;
 862        op->new_nr_nodes = tx->nr_nodes;
 863        op->in_transaction = true;
 864    }
 865
 866    return 0;
 867}
 868
 869int xs_impl_read(XenstoreImplState *s, unsigned int dom_id,
 870                 xs_transaction_t tx_id, const char *path, GByteArray *data)
 871{
 872    /*
 873     * The data GByteArray shall exist, and will be freed by caller.
 874     * Just g_byte_array_append() to it.
 875     */
 876    struct walk_op op;
 877    XsNode **n;
 878    int ret;
 879
 880    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
 881    if (ret) {
 882        return ret;
 883    }
 884    op.op_fn = xs_node_get_content;
 885    op.op_opaque = data;
 886    return xs_node_walk(n, &op);
 887}
 888
 889int xs_impl_write(XenstoreImplState *s, unsigned int dom_id,
 890                  xs_transaction_t tx_id, const char *path, GByteArray *data)
 891{
 892    /*
 893     * The data GByteArray shall exist, will be freed by caller. You are
 894     * free to use g_byte_array_steal() and keep the data. Or just ref it.
 895     */
 896    struct walk_op op;
 897    XsNode **n;
 898    int ret;
 899
 900    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
 901    if (ret) {
 902        return ret;
 903    }
 904    op.op_fn = xs_node_add_content;
 905    op.op_opaque = data;
 906    op.mutating = true;
 907    op.create_dirs = true;
 908    return xs_node_walk(n, &op);
 909}
 910
 911int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id,
 912                      xs_transaction_t tx_id, const char *path,
 913                      uint64_t *gencnt, GList **items)
 914{
 915    /*
 916     * The items are (char *) to be freed by caller. Although it's consumed
 917     * immediately so if you want to change it to (const char *) and keep
 918     * them, go ahead and change the caller.
 919     */
 920    struct walk_op op;
 921    XsNode **n;
 922    int ret;
 923
 924    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
 925    if (ret) {
 926        return ret;
 927    }
 928    op.op_fn = xs_node_directory;
 929    op.op_opaque = items;
 930    op.op_opaque2 = gencnt;
 931    return xs_node_walk(n, &op);
 932}
 933
 934int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id,
 935                              xs_transaction_t *tx_id)
 936{
 937    XsTransaction *tx;
 938
 939    if (*tx_id != XBT_NULL) {
 940        return EINVAL;
 941    }
 942
 943    if (dom_id && s->nr_domu_transactions >= XS_MAX_TRANSACTIONS) {
 944        return ENOSPC;
 945    }
 946
 947    tx = g_new0(XsTransaction, 1);
 948
 949    tx->nr_nodes = s->nr_nodes;
 950    tx->tx_id = next_tx(s);
 951    tx->base_tx = s->root_tx;
 952    tx->root = xs_node_ref(s->root);
 953    tx->dom_id = dom_id;
 954
 955    g_hash_table_insert(s->transactions, GINT_TO_POINTER(tx->tx_id), tx);
 956    if (dom_id) {
 957        s->nr_domu_transactions++;
 958    }
 959    *tx_id = tx->tx_id;
 960    return 0;
 961}
 962
 963static gboolean tx_commit_walk(gpointer key, gpointer value,
 964                               gpointer user_data)
 965{
 966    struct walk_op *op = user_data;
 967    int path_len = strlen(op->path);
 968    int key_len = strlen(key);
 969    bool fire_parents = true;
 970    XsWatch *watch;
 971    XsNode *n = value;
 972
 973    if (n->ref != 1) {
 974        return false;
 975    }
 976
 977    if (n->deleted_in_tx) {
 978        /*
 979         * We fire watches on our parents if we are the *first* node
 980         * to be deleted (the topmost one). This matches the behaviour
 981         * when deleting in the live tree.
 982         */
 983        fire_parents = !op->deleted_in_tx;
 984
 985        /* Only used on the way down so no need to clear it later */
 986        op->deleted_in_tx = true;
 987    }
 988
 989    assert(key_len + path_len + 2 <= sizeof(op->path));
 990    op->path[path_len] = '/';
 991    memcpy(op->path + path_len + 1, key, key_len + 1);
 992
 993    watch = g_hash_table_lookup(op->s->watches, op->path);
 994    if (watch) {
 995        op->watches = g_list_append(op->watches, watch);
 996    }
 997
 998    if (n->children) {
 999        g_hash_table_foreach_remove(n->children, tx_commit_walk, op);
1000    }
1001
1002    if (watch) {
1003        op->watches = g_list_remove(op->watches, watch);
1004    }
1005
1006    /*
1007     * Don't fire watches if this node was only copied because a
1008     * descendent was changed. The modified_in_tx flag indicates the
1009     * ones which were really changed.
1010     */
1011    if (n->modified_in_tx || n->deleted_in_tx) {
1012        fire_watches(op, fire_parents);
1013        n->modified_in_tx = false;
1014    }
1015    op->path[path_len] = '\0';
1016
1017    /* Deleted nodes really do get expunged when we commit */
1018    return n->deleted_in_tx;
1019}
1020
1021static int transaction_commit(XenstoreImplState *s, XsTransaction *tx)
1022{
1023    struct walk_op op;
1024    XsNode **n;
1025
1026    if (s->root_tx != tx->base_tx) {
1027        return EAGAIN;
1028    }
1029    xs_node_unref(s->root);
1030    s->root = tx->root;
1031    tx->root = NULL;
1032    s->root_tx = tx->tx_id;
1033    s->nr_nodes = tx->nr_nodes;
1034
1035    init_walk_op(s, &op, XBT_NULL, tx->dom_id, "/", &n);
1036    op.deleted_in_tx = false;
1037    op.mutating = true;
1038
1039    /*
1040     * Walk the new root and fire watches on any node which has a
1041     * refcount of one (which is therefore unique to this transaction).
1042     */
1043    if (s->root->children) {
1044        g_hash_table_foreach_remove(s->root->children, tx_commit_walk, &op);
1045    }
1046
1047    return 0;
1048}
1049
1050int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id,
1051                            xs_transaction_t tx_id, bool commit)
1052{
1053    int ret = 0;
1054    XsTransaction *tx = g_hash_table_lookup(s->transactions,
1055                                            GINT_TO_POINTER(tx_id));
1056
1057    if (!tx || tx->dom_id != dom_id) {
1058        return ENOENT;
1059    }
1060
1061    if (commit) {
1062        ret = transaction_commit(s, tx);
1063    }
1064
1065    g_hash_table_remove(s->transactions, GINT_TO_POINTER(tx_id));
1066    if (dom_id) {
1067        assert(s->nr_domu_transactions);
1068        s->nr_domu_transactions--;
1069    }
1070    return ret;
1071}
1072
1073int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id,
1074               xs_transaction_t tx_id, const char *path)
1075{
1076    struct walk_op op;
1077    XsNode **n;
1078    int ret;
1079
1080    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1081    if (ret) {
1082        return ret;
1083    }
1084    op.op_fn = xs_node_rm;
1085    op.mutating = true;
1086    return xs_node_walk(n, &op);
1087}
1088
1089int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id,
1090                      xs_transaction_t tx_id, const char *path, GList **perms)
1091{
1092    struct walk_op op;
1093    XsNode **n;
1094    int ret;
1095
1096    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1097    if (ret) {
1098        return ret;
1099    }
1100    op.op_fn = xs_node_get_perms;
1101    op.op_opaque = perms;
1102    return xs_node_walk(n, &op);
1103}
1104
1105static void is_valid_perm(gpointer data, gpointer user_data)
1106{
1107    char *perm = data;
1108    bool *valid = user_data;
1109    char letter;
1110    unsigned int dom_id;
1111
1112    if (!*valid) {
1113        return;
1114    }
1115
1116    if (sscanf(perm, "%c%u", &letter, &dom_id) != 2) {
1117        *valid = false;
1118        return;
1119    }
1120
1121    switch (letter) {
1122    case 'n':
1123    case 'r':
1124    case 'w':
1125    case 'b':
1126        break;
1127
1128    default:
1129        *valid = false;
1130        break;
1131    }
1132}
1133
1134int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id,
1135                      xs_transaction_t tx_id, const char *path, GList *perms)
1136{
1137    struct walk_op op;
1138    XsNode **n;
1139    bool valid = true;
1140    int ret;
1141
1142    if (!g_list_length(perms)) {
1143        return EINVAL;
1144    }
1145
1146    g_list_foreach(perms, is_valid_perm, &valid);
1147    if (!valid) {
1148        return EINVAL;
1149    }
1150
1151    ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1152    if (ret) {
1153        return ret;
1154    }
1155    op.op_fn = xs_node_set_perms;
1156    op.op_opaque = perms;
1157    op.mutating = true;
1158    return xs_node_walk(n, &op);
1159}
1160
1161static int do_xs_impl_watch(XenstoreImplState *s, unsigned int dom_id,
1162                            const char *path, const char *token,
1163                            xs_impl_watch_fn fn, void *opaque)
1164
1165{
1166    char abspath[XENSTORE_ABS_PATH_MAX + 1];
1167    XsWatch *w, *l;
1168    int ret;
1169
1170    ret = validate_path(abspath, path, dom_id);
1171    if (ret) {
1172        return ret;
1173    }
1174
1175    /* Check for duplicates */
1176    l = w = g_hash_table_lookup(s->watches, abspath);
1177    while (w) {
1178        if (!g_strcmp0(token, w->token) &&  opaque == w->cb_opaque &&
1179            fn == w->cb && dom_id == w->dom_id) {
1180            return EEXIST;
1181        }
1182        w = w->next;
1183    }
1184
1185    if (dom_id && s->nr_domu_watches >= XS_MAX_WATCHES) {
1186        return E2BIG;
1187    }
1188
1189    w = g_new0(XsWatch, 1);
1190    w->token = g_strdup(token);
1191    w->cb = fn;
1192    w->cb_opaque = opaque;
1193    w->dom_id = dom_id;
1194    w->rel_prefix = strlen(abspath) - strlen(path);
1195
1196    /* l was looked up above when checking for duplicates */
1197    if (l) {
1198        w->next = l->next;
1199        l->next = w;
1200    } else {
1201        g_hash_table_insert(s->watches, g_strdup(abspath), w);
1202    }
1203    if (dom_id) {
1204        s->nr_domu_watches++;
1205    }
1206
1207    return 0;
1208}
1209
1210int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path,
1211                  const char *token, xs_impl_watch_fn fn, void *opaque)
1212{
1213    int ret = do_xs_impl_watch(s, dom_id, path, token, fn, opaque);
1214
1215    if (!ret) {
1216        /* A new watch should fire immediately */
1217        fn(opaque, path, token);
1218    }
1219
1220    return ret;
1221}
1222
1223static XsWatch *free_watch(XenstoreImplState *s, XsWatch *w)
1224{
1225    XsWatch *next = w->next;
1226
1227    if (w->dom_id) {
1228        assert(s->nr_domu_watches);
1229        s->nr_domu_watches--;
1230    }
1231
1232    g_free(w->token);
1233    g_free(w);
1234
1235    return next;
1236}
1237
1238int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id,
1239                    const char *path, const char *token,
1240                    xs_impl_watch_fn fn, void *opaque)
1241{
1242    char abspath[XENSTORE_ABS_PATH_MAX + 1];
1243    XsWatch *w, **l;
1244    int ret;
1245
1246    ret = validate_path(abspath, path, dom_id);
1247    if (ret) {
1248        return ret;
1249    }
1250
1251    w = g_hash_table_lookup(s->watches, abspath);
1252    if (!w) {
1253        return ENOENT;
1254    }
1255
1256    /*
1257     * The hash table contains the first element of a list of
1258     * watches. Removing the first element in the list is a
1259     * special case because we have to update the hash table to
1260     * point to the next (or remove it if there's nothing left).
1261     */
1262    if (!g_strcmp0(token, w->token) && fn == w->cb && opaque == w->cb_opaque &&
1263        dom_id == w->dom_id) {
1264        if (w->next) {
1265            /* Insert the previous 'next' into the hash table */
1266            g_hash_table_insert(s->watches, g_strdup(abspath), w->next);
1267        } else {
1268            /* Nothing left; remove from hash table */
1269            g_hash_table_remove(s->watches, abspath);
1270        }
1271        free_watch(s, w);
1272        return 0;
1273    }
1274
1275    /*
1276     * We're all done messing with the hash table because the element
1277     * it points to has survived the cull. Now it's just a simple
1278     * linked list removal operation.
1279     */
1280    for (l = &w->next; *l; l = &w->next) {
1281        w = *l;
1282
1283        if (!g_strcmp0(token, w->token) && fn == w->cb &&
1284            opaque != w->cb_opaque && dom_id == w->dom_id) {
1285            *l = free_watch(s, w);
1286            return 0;
1287        }
1288    }
1289
1290    return ENOENT;
1291}
1292
1293int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id)
1294{
1295    char **watch_paths;
1296    guint nr_watch_paths;
1297    guint i;
1298
1299    watch_paths = (char **)g_hash_table_get_keys_as_array(s->watches,
1300                                                          &nr_watch_paths);
1301
1302    for (i = 0; i < nr_watch_paths; i++) {
1303        XsWatch *w1 = g_hash_table_lookup(s->watches, watch_paths[i]);
1304        XsWatch *w2, *w, **l;
1305
1306        /*
1307         * w1 is the original list. The hash table has this pointer.
1308         * w2 is the head of our newly-filtered list.
1309         * w and l are temporary for processing. w is somewhat redundant
1310         * with *l but makes my eyes bleed less.
1311         */
1312
1313        w = w2 = w1;
1314        l = &w;
1315        while (w) {
1316            if (w->dom_id == dom_id) {
1317                /* If we're freeing the head of the list, bump w2 */
1318                if (w2 == w) {
1319                    w2 = w->next;
1320                }
1321                *l = free_watch(s, w);
1322            } else {
1323                l = &w->next;
1324            }
1325            w = *l;
1326        }
1327        /*
1328         * If the head of the list survived the cull, we don't need to
1329         * touch the hash table and we're done with this path. Else...
1330         */
1331        if (w1 != w2) {
1332            g_hash_table_steal(s->watches, watch_paths[i]);
1333
1334            /*
1335             * It was already freed. (Don't worry, this whole thing is
1336             * single-threaded and nobody saw it in the meantime). And
1337             * having *stolen* it, we now own the watch_paths[i] string
1338             * so if we don't give it back to the hash table, we need
1339             * to free it.
1340             */
1341            if (w2) {
1342                g_hash_table_insert(s->watches, watch_paths[i], w2);
1343            } else {
1344                g_free(watch_paths[i]);
1345            }
1346        }
1347    }
1348    g_free(watch_paths);
1349    return 0;
1350}
1351
1352static void xs_tx_free(void *_tx)
1353{
1354    XsTransaction *tx = _tx;
1355    if (tx->root) {
1356        xs_node_unref(tx->root);
1357    }
1358    g_free(tx);
1359}
1360
1361XenstoreImplState *xs_impl_create(unsigned int dom_id)
1362{
1363    XenstoreImplState *s = g_new0(XenstoreImplState, 1);
1364    GList *perms;
1365
1366    s->watches = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
1367    s->transactions = g_hash_table_new_full(g_direct_hash, g_direct_equal,
1368                                            NULL, xs_tx_free);
1369
1370    perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, 0));
1371    s->root = xs_node_create("/", perms);
1372    g_list_free_full(perms, g_free);
1373    s->nr_nodes = 1;
1374
1375    s->root_tx = s->last_tx = 1;
1376    return s;
1377}
1378
1379
1380static void clear_serialized_tx(gpointer key, gpointer value, gpointer opaque)
1381{
1382    XsNode *n = value;
1383
1384    n->serialized_tx = XBT_NULL;
1385    if (n->children) {
1386        g_hash_table_foreach(n->children, clear_serialized_tx, NULL);
1387    }
1388}
1389
1390static void clear_tx_serialized_tx(gpointer key, gpointer value,
1391                                   gpointer opaque)
1392{
1393    XsTransaction *t = value;
1394
1395    clear_serialized_tx(NULL, t->root, NULL);
1396}
1397
1398static void write_be32(GByteArray *save, uint32_t val)
1399{
1400    uint32_t be = htonl(val);
1401    g_byte_array_append(save, (void *)&be, sizeof(be));
1402}
1403
1404
1405struct save_state {
1406    GByteArray *bytes;
1407    unsigned int tx_id;
1408};
1409
1410#define MODIFIED_IN_TX  (1U << 0)
1411#define DELETED_IN_TX   (1U << 1)
1412#define NODE_REF        (1U << 2)
1413
1414static void save_node(gpointer key, gpointer value, gpointer opaque)
1415{
1416    struct save_state *ss = opaque;
1417    XsNode *n = value;
1418    char *name = key;
1419    uint8_t flag = 0;
1420
1421    /* Child nodes (i.e. anything but the root) have a name */
1422    if (name) {
1423        g_byte_array_append(ss->bytes, key, strlen(key) + 1);
1424    }
1425
1426    /*
1427     * If we already wrote this node, refer to the previous copy.
1428     * There's no rename/move in XenStore, so all we need to find
1429     * it is the tx_id of the transation in which it exists. Which
1430     * may be the root tx.
1431     */
1432    if (n->serialized_tx != XBT_NULL) {
1433        flag = NODE_REF;
1434        g_byte_array_append(ss->bytes, &flag, 1);
1435        write_be32(ss->bytes, n->serialized_tx);
1436    } else {
1437        GList *l;
1438        n->serialized_tx = ss->tx_id;
1439
1440        if (n->modified_in_tx) {
1441            flag |= MODIFIED_IN_TX;
1442        }
1443        if (n->deleted_in_tx) {
1444            flag |= DELETED_IN_TX;
1445        }
1446        g_byte_array_append(ss->bytes, &flag, 1);
1447
1448        if (n->content) {
1449            write_be32(ss->bytes, n->content->len);
1450            g_byte_array_append(ss->bytes, n->content->data, n->content->len);
1451        } else {
1452            write_be32(ss->bytes, 0);
1453        }
1454
1455        for (l = n->perms; l; l = l->next) {
1456            g_byte_array_append(ss->bytes, l->data, strlen(l->data) + 1);
1457        }
1458        /* NUL termination after perms */
1459        g_byte_array_append(ss->bytes, (void *)"", 1);
1460
1461        if (n->children) {
1462            g_hash_table_foreach(n->children, save_node, ss);
1463        }
1464        /* NUL termination after children (child name is NUL) */
1465        g_byte_array_append(ss->bytes, (void *)"", 1);
1466    }
1467}
1468
1469static void save_tree(struct save_state *ss, uint32_t tx_id, XsNode *root)
1470{
1471    write_be32(ss->bytes, tx_id);
1472    ss->tx_id = tx_id;
1473    save_node(NULL, root, ss);
1474}
1475
1476static void save_tx(gpointer key, gpointer value, gpointer opaque)
1477{
1478    uint32_t tx_id = GPOINTER_TO_INT(key);
1479    struct save_state *ss = opaque;
1480    XsTransaction *n = value;
1481
1482    write_be32(ss->bytes, n->base_tx);
1483    write_be32(ss->bytes, n->dom_id);
1484
1485    save_tree(ss, tx_id, n->root);
1486}
1487
1488static void save_watch(gpointer key, gpointer value, gpointer opaque)
1489{
1490    struct save_state *ss = opaque;
1491    XsWatch *w = value;
1492
1493    /* We only save the *guest* watches. */
1494    if (w->dom_id) {
1495        gpointer relpath = key + w->rel_prefix;
1496        g_byte_array_append(ss->bytes, relpath, strlen(relpath) + 1);
1497        g_byte_array_append(ss->bytes, (void *)w->token, strlen(w->token) + 1);
1498    }
1499}
1500
1501GByteArray *xs_impl_serialize(XenstoreImplState *s)
1502{
1503    struct save_state ss;
1504
1505    ss.bytes = g_byte_array_new();
1506
1507    /*
1508     * node = flags [ real_node / node_ref ]
1509     *   flags = uint8_t (MODIFIED_IN_TX | DELETED_IN_TX | NODE_REF)
1510     *   node_ref = tx_id (in which the original version of this node exists)
1511     *   real_node = content perms child* NUL
1512     *     content = len data
1513     *       len = uint32_t
1514     *       data = uint8_t{len}
1515     *     perms = perm* NUL
1516     *       perm = asciiz
1517     *   child = name node
1518     *     name = asciiz
1519     *
1520     * tree = tx_id node
1521     *   tx_id = uint32_t
1522     *
1523     * transaction = base_tx_id dom_id tree
1524     *   base_tx_id = uint32_t
1525     *   dom_id = uint32_t
1526     *
1527     * tx_list = tree transaction* XBT_NULL
1528     *
1529     * watch = path token
1530     *   path = asciiz
1531     *   token = asciiz
1532     *
1533     * watch_list = watch* NUL
1534     *
1535     * xs_serialize_stream = last_tx tx_list watch_list
1536     *   last_tx = uint32_t
1537     */
1538
1539    /* Clear serialized_tx in every node. */
1540    if (s->serialized) {
1541        clear_serialized_tx(NULL, s->root, NULL);
1542        g_hash_table_foreach(s->transactions, clear_tx_serialized_tx, NULL);
1543    }
1544
1545    s->serialized = true;
1546
1547    write_be32(ss.bytes, s->last_tx);
1548    save_tree(&ss, s->root_tx, s->root);
1549    g_hash_table_foreach(s->transactions, save_tx, &ss);
1550
1551    write_be32(ss.bytes, XBT_NULL);
1552
1553    g_hash_table_foreach(s->watches, save_watch, &ss);
1554    g_byte_array_append(ss.bytes, (void *)"", 1);
1555
1556    return ss.bytes;
1557}
1558
1559struct unsave_state {
1560    char path[XENSTORE_ABS_PATH_MAX + 1];
1561    XenstoreImplState *s;
1562    GByteArray *bytes;
1563    uint8_t *d;
1564    size_t l;
1565    bool root_walk;
1566};
1567
1568static int consume_be32(struct unsave_state *us, unsigned int *val)
1569{
1570    uint32_t d;
1571
1572    if (us->l < sizeof(d)) {
1573        return -EINVAL;
1574    }
1575    memcpy(&d, us->d, sizeof(d));
1576    *val = ntohl(d);
1577    us->d += sizeof(d);
1578    us->l -= sizeof(d);
1579    return 0;
1580}
1581
1582static int consume_string(struct unsave_state *us, char **str, size_t *len)
1583{
1584    size_t l;
1585
1586    if (!us->l) {
1587        return -EINVAL;
1588    }
1589
1590    l = strnlen((void *)us->d, us->l);
1591    if (l == us->l) {
1592        return -EINVAL;
1593    }
1594
1595    if (str) {
1596        *str = (void *)us->d;
1597    }
1598    if (len) {
1599        *len = l;
1600    }
1601
1602    us->d += l + 1;
1603    us->l -= l + 1;
1604    return 0;
1605}
1606
1607static XsNode *lookup_node(XsNode *n, char *path)
1608{
1609    char *slash = strchr(path, '/');
1610    XsNode *child;
1611
1612    if (path[0] == '\0') {
1613        return n;
1614    }
1615
1616    if (slash) {
1617        *slash = '\0';
1618    }
1619
1620    if (!n->children) {
1621        return NULL;
1622    }
1623    child = g_hash_table_lookup(n->children, path);
1624    if (!slash) {
1625        return child;
1626    }
1627
1628    *slash = '/';
1629    if (!child) {
1630        return NULL;
1631    }
1632    return lookup_node(child, slash + 1);
1633}
1634
1635static XsNode *lookup_tx_node(struct unsave_state *us, unsigned int tx_id)
1636{
1637    XsTransaction *t;
1638    if (tx_id == us->s->root_tx) {
1639        return lookup_node(us->s->root, us->path + 1);
1640    }
1641
1642    t = g_hash_table_lookup(us->s->transactions, GINT_TO_POINTER(tx_id));
1643    if (!t) {
1644        return NULL;
1645    }
1646    g_assert(t->root);
1647    return lookup_node(t->root, us->path + 1);
1648}
1649
1650static void count_child_nodes(gpointer key, gpointer value, gpointer user_data)
1651{
1652    unsigned int *nr_nodes = user_data;
1653    XsNode *n = value;
1654
1655    (*nr_nodes)++;
1656
1657    if (n->children) {
1658        g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
1659    }
1660}
1661
1662static int consume_node(struct unsave_state *us, XsNode **nodep,
1663                        unsigned int *nr_nodes)
1664{
1665    XsNode *n = NULL;
1666    uint8_t flags;
1667    int ret;
1668
1669    if (us->l < 1) {
1670        return -EINVAL;
1671    }
1672    flags = us->d[0];
1673    us->d++;
1674    us->l--;
1675
1676    if (flags == NODE_REF) {
1677        unsigned int tx;
1678
1679        ret = consume_be32(us, &tx);
1680        if (ret) {
1681            return ret;
1682        }
1683
1684        n = lookup_tx_node(us, tx);
1685        if (!n) {
1686            return -EINVAL;
1687        }
1688        n->ref++;
1689        if (n->children) {
1690            g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
1691        }
1692    } else {
1693        uint32_t datalen;
1694
1695        if (flags & ~(DELETED_IN_TX | MODIFIED_IN_TX)) {
1696            return -EINVAL;
1697        }
1698        n = xs_node_new();
1699
1700        if (flags & DELETED_IN_TX) {
1701            n->deleted_in_tx = true;
1702        }
1703        if (flags & MODIFIED_IN_TX) {
1704            n->modified_in_tx = true;
1705        }
1706        ret = consume_be32(us, &datalen);
1707        if (ret) {
1708            xs_node_unref(n);
1709            return -EINVAL;
1710        }
1711        if (datalen) {
1712            if (datalen > us->l) {
1713                xs_node_unref(n);
1714                return -EINVAL;
1715            }
1716
1717            GByteArray *node_data = g_byte_array_new();
1718            g_byte_array_append(node_data, us->d, datalen);
1719            us->d += datalen;
1720            us->l -= datalen;
1721            n->content = node_data;
1722
1723            if (us->root_walk) {
1724                n->modified_in_tx = true;
1725            }
1726        }
1727        while (1) {
1728            char *perm = NULL;
1729            size_t permlen = 0;
1730
1731            ret = consume_string(us, &perm, &permlen);
1732            if (ret) {
1733                xs_node_unref(n);
1734                return ret;
1735            }
1736
1737            if (!permlen) {
1738                break;
1739            }
1740
1741            n->perms = g_list_append(n->perms, g_strdup(perm));
1742        }
1743
1744        /* Now children */
1745        while (1) {
1746            size_t childlen;
1747            char *childname;
1748            char *pathend;
1749            XsNode *child = NULL;
1750
1751            ret = consume_string(us, &childname, &childlen);
1752            if (ret) {
1753                xs_node_unref(n);
1754                return ret;
1755            }
1756
1757            if (!childlen) {
1758                break;
1759            }
1760
1761            pathend = us->path + strlen(us->path);
1762            strncat(us->path, "/", sizeof(us->path) - 1);
1763            strncat(us->path, childname, sizeof(us->path) - 1);
1764
1765            ret = consume_node(us, &child, nr_nodes);
1766            *pathend = '\0';
1767            if (ret) {
1768                xs_node_unref(n);
1769                return ret;
1770            }
1771            g_assert(child);
1772            xs_node_add_child(n, childname, child);
1773        }
1774
1775        /*
1776         * If the node has no data and no children we still want to fire
1777         * a watch on it.
1778         */
1779        if (us->root_walk && !n->children) {
1780            n->modified_in_tx = true;
1781        }
1782    }
1783
1784    if (!n->deleted_in_tx) {
1785        (*nr_nodes)++;
1786    }
1787
1788    *nodep = n;
1789    return 0;
1790}
1791
1792static int consume_tree(struct unsave_state *us, XsTransaction *t)
1793{
1794    int ret;
1795
1796    ret = consume_be32(us, &t->tx_id);
1797    if (ret) {
1798        return ret;
1799    }
1800
1801    if (t->tx_id > us->s->last_tx) {
1802        return -EINVAL;
1803    }
1804
1805    us->path[0] = '\0';
1806
1807    return consume_node(us, &t->root, &t->nr_nodes);
1808}
1809
1810int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes,
1811                        unsigned int dom_id, xs_impl_watch_fn watch_fn,
1812                        void *watch_opaque)
1813{
1814    struct unsave_state us;
1815    XsTransaction base_t = { 0 };
1816    int ret;
1817
1818    us.s = s;
1819    us.bytes = bytes;
1820    us.d = bytes->data;
1821    us.l = bytes->len;
1822
1823    xs_impl_reset_watches(s, dom_id);
1824    g_hash_table_remove_all(s->transactions);
1825
1826    xs_node_unref(s->root);
1827    s->root = NULL;
1828    s->root_tx = s->last_tx = XBT_NULL;
1829
1830    ret = consume_be32(&us, &s->last_tx);
1831    if (ret) {
1832        return ret;
1833    }
1834
1835    /*
1836     * Consume the base tree into a transaction so that watches can be
1837     * fired as we commit it. By setting us.root_walk we cause the nodes
1838     * to be marked as 'modified_in_tx' as they are created, so that the
1839     * watches are triggered on them.
1840     */
1841    base_t.dom_id = dom_id;
1842    base_t.base_tx = XBT_NULL;
1843    us.root_walk = true;
1844    ret = consume_tree(&us, &base_t);
1845    if (ret) {
1846        return ret;
1847    }
1848    us.root_walk = false;
1849
1850    /*
1851     * Commit the transaction now while the refcount on all nodes is 1.
1852     * Note that we haven't yet reinstated the *guest* watches but that's
1853     * OK because we don't want the guest to see any changes. Even any
1854     * backend nodes which get recreated should be *precisely* as they
1855     * were before the migration. Back ends may have been instantiated
1856     * already, and will see the frontend magically blink into existence
1857     * now (well, from the aio_bh which fires the watches). It's their
1858     * responsibility to rebuild everything precisely as it was before.
1859     */
1860    ret = transaction_commit(s, &base_t);
1861    if (ret) {
1862        return ret;
1863    }
1864
1865    while (1) {
1866        unsigned int base_tx;
1867        XsTransaction *t;
1868
1869        ret = consume_be32(&us, &base_tx);
1870        if (ret) {
1871            return ret;
1872        }
1873        if (base_tx == XBT_NULL) {
1874            break;
1875        }
1876
1877        t = g_new0(XsTransaction, 1);
1878        t->base_tx = base_tx;
1879
1880        ret = consume_be32(&us, &t->dom_id);
1881        if (!ret) {
1882            ret = consume_tree(&us, t);
1883        }
1884        if (ret) {
1885            g_free(t);
1886            return ret;
1887        }
1888        g_assert(t->root);
1889        if (t->dom_id) {
1890            s->nr_domu_transactions++;
1891        }
1892        g_hash_table_insert(s->transactions, GINT_TO_POINTER(t->tx_id), t);
1893    }
1894
1895    while (1) {
1896        char *path, *token;
1897        size_t pathlen, toklen;
1898
1899        ret = consume_string(&us, &path, &pathlen);
1900        if (ret) {
1901            return ret;
1902        }
1903        if (!pathlen) {
1904            break;
1905        }
1906
1907        ret = consume_string(&us, &token, &toklen);
1908        if (ret) {
1909            return ret;
1910        }
1911
1912        if (!watch_fn) {
1913            continue;
1914        }
1915
1916        ret = do_xs_impl_watch(s, dom_id, path, token, watch_fn, watch_opaque);
1917        if (ret) {
1918            return ret;
1919        }
1920    }
1921
1922    if (us.l) {
1923        return -EINVAL;
1924    }
1925
1926    return 0;
1927}
1928