qemu/net/filter-rewriter.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
   3 * Copyright (c) 2016 FUJITSU LIMITED
   4 * Copyright (c) 2016 Intel Corporation
   5 *
   6 * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
   7 *
   8 * This work is licensed under the terms of the GNU GPL, version 2 or
   9 * later.  See the COPYING file in the top-level directory.
  10 */
  11
  12#include "qemu/osdep.h"
  13#include "trace.h"
  14#include "colo.h"
  15#include "net/filter.h"
  16#include "net/net.h"
  17#include "qemu-common.h"
  18#include "qemu/error-report.h"
  19#include "qom/object.h"
  20#include "qemu/main-loop.h"
  21#include "qemu/iov.h"
  22#include "net/checksum.h"
  23#include "net/colo.h"
  24#include "migration/colo.h"
  25#include "util.h"
  26
  27#define FILTER_COLO_REWRITER(obj) \
  28    OBJECT_CHECK(RewriterState, (obj), TYPE_FILTER_REWRITER)
  29
  30#define TYPE_FILTER_REWRITER "filter-rewriter"
  31#define FAILOVER_MODE_ON  true
  32#define FAILOVER_MODE_OFF false
  33
  34typedef struct RewriterState {
  35    NetFilterState parent_obj;
  36    NetQueue *incoming_queue;
  37    /* hashtable to save connection */
  38    GHashTable *connection_track_table;
  39    bool vnet_hdr;
  40    bool failover_mode;
  41} RewriterState;
  42
  43static void filter_rewriter_failover_mode(RewriterState *s)
  44{
  45    s->failover_mode = FAILOVER_MODE_ON;
  46}
  47
  48static void filter_rewriter_flush(NetFilterState *nf)
  49{
  50    RewriterState *s = FILTER_COLO_REWRITER(nf);
  51
  52    if (!qemu_net_queue_flush(s->incoming_queue)) {
  53        /* Unable to empty the queue, purge remaining packets */
  54        qemu_net_queue_purge(s->incoming_queue, nf->netdev);
  55    }
  56}
  57
  58/*
  59 * Return 1 on success, if return 0 means the pkt
  60 * is not TCP packet
  61 */
  62static int is_tcp_packet(Packet *pkt)
  63{
  64    if (!parse_packet_early(pkt) &&
  65        pkt->ip->ip_p == IPPROTO_TCP) {
  66        return 1;
  67    } else {
  68        return 0;
  69    }
  70}
  71
  72/* handle tcp packet from primary guest */
  73static int handle_primary_tcp_pkt(RewriterState *rf,
  74                                  Connection *conn,
  75                                  Packet *pkt, ConnectionKey *key)
  76{
  77    struct tcp_hdr *tcp_pkt;
  78
  79    tcp_pkt = (struct tcp_hdr *)pkt->transport_header;
  80    if (trace_event_get_state_backends(TRACE_COLO_FILTER_REWRITER_DEBUG)) {
  81        trace_colo_filter_rewriter_pkt_info(__func__,
  82                    inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst),
  83                    ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
  84                    tcp_pkt->th_flags);
  85        trace_colo_filter_rewriter_conn_offset(conn->offset);
  86    }
  87
  88    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN)) &&
  89        conn->tcp_state == TCPS_SYN_SENT) {
  90        conn->tcp_state = TCPS_ESTABLISHED;
  91    }
  92
  93    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
  94        /*
  95         * we use this flag update offset func
  96         * run once in independent tcp connection
  97         */
  98        conn->tcp_state = TCPS_SYN_RECEIVED;
  99    }
 100
 101    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) {
 102        if (conn->tcp_state == TCPS_SYN_RECEIVED) {
 103            /*
 104             * offset = secondary_seq - primary seq
 105             * ack packet sent by guest from primary node,
 106             * so we use th_ack - 1 get primary_seq
 107             */
 108            conn->offset -= (ntohl(tcp_pkt->th_ack) - 1);
 109            conn->tcp_state = TCPS_ESTABLISHED;
 110        }
 111        if (conn->offset) {
 112            /* handle packets to the secondary from the primary */
 113            tcp_pkt->th_ack = htonl(ntohl(tcp_pkt->th_ack) + conn->offset);
 114
 115            net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len,
 116                                   pkt->size - pkt->vnet_hdr_len);
 117        }
 118
 119        /*
 120         * Passive close step 3
 121         */
 122        if ((conn->tcp_state == TCPS_LAST_ACK) &&
 123            (ntohl(tcp_pkt->th_ack) == (conn->fin_ack_seq + 1))) {
 124            conn->tcp_state = TCPS_CLOSED;
 125            g_hash_table_remove(rf->connection_track_table, key);
 126        }
 127    }
 128
 129    if ((tcp_pkt->th_flags & TH_FIN) == TH_FIN) {
 130        /*
 131         * Passive close.
 132         * Step 1:
 133         * The *server* side of this connect is VM, *client* tries to close
 134         * the connection. We will into CLOSE_WAIT status.
 135         *
 136         * Step 2:
 137         * In this step we will into LAST_ACK status.
 138         *
 139         * We got 'fin=1, ack=1' packet from server side, we need to
 140         * record the seq of 'fin=1, ack=1' packet.
 141         *
 142         * Step 3:
 143         * We got 'ack=1' packets from client side, it acks 'fin=1, ack=1'
 144         * packet from server side. From this point, we can ensure that there
 145         * will be no packets in the connection, except that, some errors
 146         * happen between the path of 'filter object' and vNIC, if this rare
 147         * case really happen, we can still create a new connection,
 148         * So it is safe to remove the connection from connection_track_table.
 149         *
 150         */
 151        if (conn->tcp_state == TCPS_ESTABLISHED) {
 152            conn->tcp_state = TCPS_CLOSE_WAIT;
 153        }
 154
 155        /*
 156         * Active close step 2.
 157         */
 158        if (conn->tcp_state == TCPS_FIN_WAIT_1) {
 159            /*
 160             * For simplify implementation, we needn't wait 2MSL time
 161             * in filter rewriter. Because guest kernel will track the
 162             * TCP status and wait 2MSL time, if client resend the FIN
 163             * packet, guest will apply the last ACK too.
 164             * So, we skip the TCPS_TIME_WAIT state here and go straight
 165             * to TCPS_CLOSED state.
 166             */
 167            conn->tcp_state = TCPS_CLOSED;
 168            g_hash_table_remove(rf->connection_track_table, key);
 169        }
 170    }
 171
 172    return 0;
 173}
 174
 175/* handle tcp packet from secondary guest */
 176static int handle_secondary_tcp_pkt(RewriterState *rf,
 177                                    Connection *conn,
 178                                    Packet *pkt, ConnectionKey *key)
 179{
 180    struct tcp_hdr *tcp_pkt;
 181
 182    tcp_pkt = (struct tcp_hdr *)pkt->transport_header;
 183
 184    if (trace_event_get_state_backends(TRACE_COLO_FILTER_REWRITER_DEBUG)) {
 185        trace_colo_filter_rewriter_pkt_info(__func__,
 186                    inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst),
 187                    ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
 188                    tcp_pkt->th_flags);
 189        trace_colo_filter_rewriter_conn_offset(conn->offset);
 190    }
 191
 192    if (conn->tcp_state == TCPS_SYN_RECEIVED &&
 193        ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
 194        /*
 195         * save offset = secondary_seq and then
 196         * in handle_primary_tcp_pkt make offset
 197         * = secondary_seq - primary_seq
 198         */
 199        conn->offset = ntohl(tcp_pkt->th_seq);
 200    }
 201
 202    /* VM active connect */
 203    if (conn->tcp_state == TCPS_CLOSED &&
 204        ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
 205        conn->tcp_state = TCPS_SYN_SENT;
 206    }
 207
 208    if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) {
 209        /* Only need to adjust seq while offset is Non-zero */
 210        if (conn->offset) {
 211            /* handle packets to the primary from the secondary*/
 212            tcp_pkt->th_seq = htonl(ntohl(tcp_pkt->th_seq) - conn->offset);
 213
 214            net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len,
 215                                   pkt->size - pkt->vnet_hdr_len);
 216        }
 217    }
 218
 219    /*
 220     * Passive close step 2:
 221     */
 222    if (conn->tcp_state == TCPS_CLOSE_WAIT &&
 223        (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) {
 224        conn->fin_ack_seq = ntohl(tcp_pkt->th_seq);
 225        conn->tcp_state = TCPS_LAST_ACK;
 226    }
 227
 228    /*
 229     * Active close
 230     *
 231     * Step 1:
 232     * The *server* side of this connect is VM, *server* tries to close
 233     * the connection.
 234     *
 235     * Step 2:
 236     * We will into CLOSE_WAIT status.
 237     * We simplify the TCPS_FIN_WAIT_2, TCPS_TIME_WAIT and
 238     * CLOSING status.
 239     */
 240    if (conn->tcp_state == TCPS_ESTABLISHED &&
 241        (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == TH_FIN) {
 242        conn->tcp_state = TCPS_FIN_WAIT_1;
 243    }
 244
 245    return 0;
 246}
 247
 248static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
 249                                         NetClientState *sender,
 250                                         unsigned flags,
 251                                         const struct iovec *iov,
 252                                         int iovcnt,
 253                                         NetPacketSent *sent_cb)
 254{
 255    RewriterState *s = FILTER_COLO_REWRITER(nf);
 256    Connection *conn;
 257    ConnectionKey key;
 258    Packet *pkt;
 259    ssize_t size = iov_size(iov, iovcnt);
 260    ssize_t vnet_hdr_len = 0;
 261    char *buf = g_malloc0(size);
 262
 263    iov_to_buf(iov, iovcnt, 0, buf, size);
 264
 265    if (s->vnet_hdr) {
 266        vnet_hdr_len = nf->netdev->vnet_hdr_len;
 267    }
 268
 269    pkt = packet_new(buf, size, vnet_hdr_len);
 270    g_free(buf);
 271
 272    /*
 273     * if we get tcp packet
 274     * we will rewrite it to make secondary guest's
 275     * connection established successfully
 276     */
 277    if (pkt && is_tcp_packet(pkt)) {
 278
 279        fill_connection_key(pkt, &key);
 280
 281        if (sender == nf->netdev) {
 282            /*
 283             * We need make tcp TX and RX packet
 284             * into one connection.
 285             */
 286            reverse_connection_key(&key);
 287        }
 288
 289        /* After failover we needn't change new TCP packet */
 290        if (s->failover_mode &&
 291            !connection_has_tracked(s->connection_track_table, &key)) {
 292            goto out;
 293        }
 294
 295        conn = connection_get(s->connection_track_table,
 296                              &key,
 297                              NULL);
 298
 299        if (sender == nf->netdev) {
 300            /* NET_FILTER_DIRECTION_TX */
 301            if (!handle_primary_tcp_pkt(s, conn, pkt, &key)) {
 302                qemu_net_queue_send(s->incoming_queue, sender, 0,
 303                (const uint8_t *)pkt->data, pkt->size, NULL);
 304                packet_destroy(pkt, NULL);
 305                pkt = NULL;
 306                /*
 307                 * We block the packet here,after rewrite pkt
 308                 * and will send it
 309                 */
 310                return 1;
 311            }
 312        } else {
 313            /* NET_FILTER_DIRECTION_RX */
 314            if (!handle_secondary_tcp_pkt(s, conn, pkt, &key)) {
 315                qemu_net_queue_send(s->incoming_queue, sender, 0,
 316                (const uint8_t *)pkt->data, pkt->size, NULL);
 317                packet_destroy(pkt, NULL);
 318                pkt = NULL;
 319                /*
 320                 * We block the packet here,after rewrite pkt
 321                 * and will send it
 322                 */
 323                return 1;
 324            }
 325        }
 326    }
 327
 328out:
 329    packet_destroy(pkt, NULL);
 330    pkt = NULL;
 331    return 0;
 332}
 333
 334static void reset_seq_offset(gpointer key, gpointer value, gpointer user_data)
 335{
 336    Connection *conn = (Connection *)value;
 337
 338    conn->offset = 0;
 339}
 340
 341static gboolean offset_is_nonzero(gpointer key,
 342                                  gpointer value,
 343                                  gpointer user_data)
 344{
 345    Connection *conn = (Connection *)value;
 346
 347    return conn->offset ? true : false;
 348}
 349
 350static void colo_rewriter_handle_event(NetFilterState *nf, int event,
 351                                       Error **errp)
 352{
 353    RewriterState *rs = FILTER_COLO_REWRITER(nf);
 354
 355    switch (event) {
 356    case COLO_EVENT_CHECKPOINT:
 357        g_hash_table_foreach(rs->connection_track_table,
 358                            reset_seq_offset, NULL);
 359        break;
 360    case COLO_EVENT_FAILOVER:
 361        if (!g_hash_table_find(rs->connection_track_table,
 362                              offset_is_nonzero, NULL)) {
 363            filter_rewriter_failover_mode(rs);
 364        }
 365        break;
 366    default:
 367        break;
 368    }
 369}
 370
 371static void colo_rewriter_cleanup(NetFilterState *nf)
 372{
 373    RewriterState *s = FILTER_COLO_REWRITER(nf);
 374
 375    /* flush packets */
 376    if (s->incoming_queue) {
 377        filter_rewriter_flush(nf);
 378        g_free(s->incoming_queue);
 379    }
 380}
 381
 382static void colo_rewriter_setup(NetFilterState *nf, Error **errp)
 383{
 384    RewriterState *s = FILTER_COLO_REWRITER(nf);
 385
 386    s->connection_track_table = g_hash_table_new_full(connection_key_hash,
 387                                                      connection_key_equal,
 388                                                      g_free,
 389                                                      connection_destroy);
 390    s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
 391}
 392
 393static bool filter_rewriter_get_vnet_hdr(Object *obj, Error **errp)
 394{
 395    RewriterState *s = FILTER_COLO_REWRITER(obj);
 396
 397    return s->vnet_hdr;
 398}
 399
 400static void filter_rewriter_set_vnet_hdr(Object *obj,
 401                                         bool value,
 402                                         Error **errp)
 403{
 404    RewriterState *s = FILTER_COLO_REWRITER(obj);
 405
 406    s->vnet_hdr = value;
 407}
 408
 409static void filter_rewriter_init(Object *obj)
 410{
 411    RewriterState *s = FILTER_COLO_REWRITER(obj);
 412
 413    s->vnet_hdr = false;
 414    s->failover_mode = FAILOVER_MODE_OFF;
 415    object_property_add_bool(obj, "vnet_hdr_support",
 416                             filter_rewriter_get_vnet_hdr,
 417                             filter_rewriter_set_vnet_hdr, NULL);
 418}
 419
 420static void colo_rewriter_class_init(ObjectClass *oc, void *data)
 421{
 422    NetFilterClass *nfc = NETFILTER_CLASS(oc);
 423
 424    nfc->setup = colo_rewriter_setup;
 425    nfc->cleanup = colo_rewriter_cleanup;
 426    nfc->receive_iov = colo_rewriter_receive_iov;
 427    nfc->handle_event = colo_rewriter_handle_event;
 428}
 429
 430static const TypeInfo colo_rewriter_info = {
 431    .name = TYPE_FILTER_REWRITER,
 432    .parent = TYPE_NETFILTER,
 433    .class_init = colo_rewriter_class_init,
 434    .instance_init = filter_rewriter_init,
 435    .instance_size = sizeof(RewriterState),
 436};
 437
 438static void register_types(void)
 439{
 440    type_register_static(&colo_rewriter_info);
 441}
 442
 443type_init(register_types);
 444