linux/arch/x86/kernel/kvm.c
/*
 * KVM paravirt_ops implementation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/idle.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>

static int kvmapf = 1;

static int parse_no_kvmapf(char *arg)
{
        kvmapf = 0;
        return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

static int steal_acc = 1;
static int parse_no_stealacc(char *arg)
{
        steal_acc = 0;
        return 0;
}

early_param("no-steal-acc", parse_no_stealacc);

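/*
 * Per-CPU memory shared with the host: apf_reason is filled in by the
 * hypervisor when it injects an asynchronous page fault, and steal_time
 * is updated with the time this vCPU spent de-scheduled.  Both areas are
 * registered below via MSR_KVM_ASYNC_PF_EN and MSR_KVM_STEAL_TIME.
 */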
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;

/*
 * No need for any "IO delay" on KVM
 */
static void kvm_io_delay(void)
{
}

#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

struct kvm_task_sleep_node {
        struct hlist_node link;
        wait_queue_head_t wq;
        u32 token;
        int cpu;
        bool halted;
};

static struct kvm_task_sleep_head {
        spinlock_t lock;
        struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
                                                  u32 token)
{
        struct hlist_node *p;

        hlist_for_each(p, &b->list) {
                struct kvm_task_sleep_node *n =
                        hlist_entry(p, typeof(*n), link);
                if (n->token == token)
                        return n;
        }

        return NULL;
}

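/*
 * Called from the async #PF handler when the host reports
 * KVM_PV_REASON_PAGE_NOT_PRESENT for @token.  Sleep until the matching
 * wake-up arrives; when sleeping is not safe (idle CPU or preemption
 * disabled) spin in halt instead so the vCPU still yields to the host.
 */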
void kvm_async_pf_task_wait(u32 token)
{
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
        struct kvm_task_sleep_node n, *e;
        DEFINE_WAIT(wait);
        int cpu, idle;

        cpu = get_cpu();
        idle = idle_cpu(cpu);
        put_cpu();

        spin_lock(&b->lock);
        e = _find_apf_task(b, token);
        if (e) {
                /* dummy entry exists -> wake-up was delivered ahead of PF */
                hlist_del(&e->link);
                kfree(e);
                spin_unlock(&b->lock);
                return;
        }

        n.token = token;
        n.cpu = smp_processor_id();
        n.halted = idle || preempt_count() > 1;
        init_waitqueue_head(&n.wq);
        hlist_add_head(&n.link, &b->list);
        spin_unlock(&b->lock);

        for (;;) {
                if (!n.halted)
                        prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
                if (hlist_unhashed(&n.link))
                        break;

                if (!n.halted) {
                        local_irq_enable();
                        schedule();
                        local_irq_disable();
                } else {
                        /*
                         * We cannot reschedule. So halt.
                         */
                        native_safe_halt();
                        local_irq_disable();
                }
        }
        if (!n.halted)
                finish_wait(&n.wq, &wait);

        return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);

static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
        hlist_del_init(&n->link);
        if (n->halted)
                smp_send_reschedule(n->cpu);
        else if (waitqueue_active(&n->wq))
                wake_up(&n->wq);
}

static void apf_task_wake_all(void)
{
        int i;

        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
                struct hlist_node *p, *next;
                struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
                spin_lock(&b->lock);
                hlist_for_each_safe(p, next, &b->list) {
                        struct kvm_task_sleep_node *n =
                                hlist_entry(p, typeof(*n), link);
                        if (n->cpu == smp_processor_id())
                                apf_task_wake_one(n);
                }
                spin_unlock(&b->lock);
        }
}

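/*
 * Called when the host reports KVM_PV_REASON_PAGE_READY for @token: wake
 * the task waiting on that token, or leave a dummy node behind if the
 * wake-up raced ahead of kvm_async_pf_task_wait().  A token of ~0 flushes
 * all waiters belonging to this CPU.
 */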
void kvm_async_pf_task_wake(u32 token)
{
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
        struct kvm_task_sleep_node *n;

        if (token == ~0) {
                apf_task_wake_all();
                return;
        }

again:
        spin_lock(&b->lock);
        n = _find_apf_task(b, token);
        if (!n) {
                /*
                 * async PF was not yet handled.
                 * Add dummy entry for the token.
                 */
                n = kzalloc(sizeof(*n), GFP_ATOMIC);
                if (!n) {
                        /*
                         * Allocation failed! Busy wait while other cpu
                         * handles async PF.
                         */
                        spin_unlock(&b->lock);
                        cpu_relax();
                        goto again;
                }
                n->token = token;
                n->cpu = smp_processor_id();
                init_waitqueue_head(&n->wq);
                hlist_add_head(&n->link, &b->list);
        } else
                apf_task_wake_one(n);
        spin_unlock(&b->lock);
        return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

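/*
 * Fetch and clear the fault reason the host stored in this CPU's shared
 * apf_reason slot; returns 0 if async PF is not enabled or nothing is
 * pending.
 */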
u32 kvm_read_and_reset_pf_reason(void)
{
        u32 reason = 0;

        if (__get_cpu_var(apf_reason).enabled) {
                reason = __get_cpu_var(apf_reason).reason;
                __get_cpu_var(apf_reason).reason = 0;
        }

        return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);

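/*
 * Page fault handler installed in place of the stock #PF entry when
 * async PF is available (see kvm_apf_trap_init()).  Ordinary faults fall
 * through to do_page_fault(); for the two paravirt reasons CR2 carries
 * the async PF token rather than a faulting address.
 */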
dotraplinkage void __kprobes
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
        switch (kvm_read_and_reset_pf_reason()) {
        default:
                do_page_fault(regs, error_code);
                break;
        case KVM_PV_REASON_PAGE_NOT_PRESENT:
                /* page is swapped out by the host. */
                rcu_irq_enter();
                exit_idle();
                kvm_async_pf_task_wait((u32)read_cr2());
                rcu_irq_exit();
                break;
        case KVM_PV_REASON_PAGE_READY:
                rcu_irq_enter();
                exit_idle();
                kvm_async_pf_task_wake((u32)read_cr2());
                rcu_irq_exit();
                break;
        }
}

static void __init paravirt_ops_setup(void)
{
        pv_info.name = "KVM";
        pv_info.paravirt_enabled = 1;

        if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
                pv_cpu_ops.io_delay = kvm_io_delay;

#ifdef CONFIG_X86_IO_APIC
        no_timer_check = 1;
#endif
}

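/*
 * Hand the host the physical address of this CPU's steal_time record via
 * MSR_KVM_STEAL_TIME; OR-ing in KVM_MSR_ENABLED switches accounting on.
 */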
static void kvm_register_steal_time(void)
{
        int cpu = smp_processor_id();
        struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

        if (!has_steal_clock)
                return;

        memset(st, 0, sizeof(*st));

        wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
        printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
                cpu, __pa(st));
}

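/*
 * Paravirtual EOI: when the host sets KVM_PV_EOI_BIT in the per-cpu
 * kvm_apic_eoi word it has volunteered to perform the EOI itself, so
 * clearing the bit is all the guest has to do and the APIC_EOI register
 * write can be skipped.
 */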
static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;

static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
        /**
         * This relies on __test_and_clear_bit to modify the memory
         * in a way that is atomic with respect to the local CPU.
         * The hypervisor only accesses this memory from the local CPU so
         * there's no need for lock or memory barriers.
         * An optimization barrier is implied in apic write.
         */
        if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
                return;
        apic_write(APIC_EOI, APIC_EOI_ACK);
}

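/*
 * Per-CPU guest-side setup: register the async PF shared area, the PV EOI
 * word and the steal time record with the host via their respective MSRs.
 * Runs on the boot CPU from kvm_guest_init()/kvm_smp_prepare_boot_cpu()
 * and on secondary CPUs from the hotplug notifier below.
 */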
void __cpuinit kvm_guest_cpu_init(void)
{
        if (!kvm_para_available())
                return;

        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
                u64 pa = __pa(&__get_cpu_var(apf_reason));

#ifdef CONFIG_PREEMPT
                pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
                wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
                __get_cpu_var(apf_reason).enabled = 1;
                printk(KERN_INFO"KVM setup async PF for cpu %d\n",
                       smp_processor_id());
        }

        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
                unsigned long pa;
                /* Size alignment is implied but just to make it explicit. */
                BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
                __get_cpu_var(kvm_apic_eoi) = 0;
                pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
                wrmsrl(MSR_KVM_PV_EOI_EN, pa);
        }

        if (has_steal_clock)
                kvm_register_steal_time();
}

static void kvm_pv_disable_apf(void)
{
        if (!__get_cpu_var(apf_reason).enabled)
                return;

        wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
        __get_cpu_var(apf_reason).enabled = 0;

        printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
               smp_processor_id());
}

static void kvm_pv_guest_cpu_reboot(void *unused)
{
        /*
         * We disable PV EOI before we load a new kernel by kexec,
         * since MSR_KVM_PV_EOI_EN stores a pointer into the old kernel's
         * memory.  The new kernel can re-enable it when it boots.
         */
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                wrmsrl(MSR_KVM_PV_EOI_EN, 0);
        kvm_pv_disable_apf();
        kvm_disable_steal_time();
}

static int kvm_pv_reboot_notify(struct notifier_block *nb,
                                unsigned long code, void *unused)
{
        if (code == SYS_RESTART)
                on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
        return NOTIFY_DONE;
}

static struct notifier_block kvm_pv_reboot_nb = {
        .notifier_call = kvm_pv_reboot_notify,
};

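/*
 * The host bumps steal_time->version before and after updating the
 * record, leaving it odd while an update is in flight, so retry until a
 * consistent snapshot with an even, unchanged version has been read.
 */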
static u64 kvm_steal_clock(int cpu)
{
        u64 steal;
        struct kvm_steal_time *src;
        int version;

        src = &per_cpu(steal_time, cpu);
        do {
                version = src->version;
                rmb();
                steal = src->steal;
                rmb();
        } while ((version & 1) || (version != src->version));

        return steal;
}

void kvm_disable_steal_time(void)
{
        if (!has_steal_clock)
                return;

        wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}

#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
        WARN_ON(kvm_register_clock("primary cpu clock"));
        kvm_guest_cpu_init();
        native_smp_prepare_boot_cpu();
}

static void __cpuinit kvm_guest_cpu_online(void *dummy)
{
        kvm_guest_cpu_init();
}

static void kvm_guest_cpu_offline(void *dummy)
{
        kvm_disable_steal_time();
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                wrmsrl(MSR_KVM_PV_EOI_EN, 0);
        kvm_pv_disable_apf();
        apf_task_wake_all();
}

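/*
 * CPU hotplug: re-register the per-cpu MSR areas when a CPU comes (back)
 * online and tear them down before it goes offline, so the host never
 * writes into the memory of a CPU that is going away.
 */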
static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
                                    unsigned long action, void *hcpu)
{
        int cpu = (unsigned long)hcpu;
        switch (action) {
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
        case CPU_ONLINE_FROZEN:
                smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
        .notifier_call  = kvm_cpu_notify,
};
#endif

static void __init kvm_apf_trap_init(void)
{
        set_intr_gate(14, &async_page_fault);
}

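/*
 * Guest-side entry point: install the paravirt hooks, initialize the
 * async PF sleeper table and wire up per-CPU registration (directly via
 * kvm_guest_cpu_init() in the UP case, otherwise through the SMP prepare
 * hook and the hotplug notifier above).
 */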
void __init kvm_guest_init(void)
{
        int i;

        if (!kvm_para_available())
                return;

        paravirt_ops_setup();
        register_reboot_notifier(&kvm_pv_reboot_nb);
        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
                spin_lock_init(&async_pf_sleepers[i].lock);
        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
                x86_init.irqs.trap_init = kvm_apf_trap_init;

        if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
                has_steal_clock = 1;
                pv_time_ops.steal_clock = kvm_steal_clock;
        }

        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                apic_set_eoi_write(kvm_guest_apic_eoi_write);

#ifdef CONFIG_SMP
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
        register_cpu_notifier(&kvm_cpu_notifier);
#else
        kvm_guest_cpu_init();
#endif
}

static bool __init kvm_detect(void)
{
        if (!kvm_para_available())
                return false;
        return true;
}

const struct hypervisor_x86 x86_hyper_kvm __refconst = {
        .name                   = "KVM",
        .detect                 = kvm_detect,
};
EXPORT_SYMBOL_GPL(x86_hyper_kvm);

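/*
 * Steal time accounting in the scheduler is gated by static keys; flip
 * them on once has_steal_clock has been established by kvm_guest_init(),
 * which runs well before this initcall.
 */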
static __init int activate_jump_labels(void)
{
        if (has_steal_clock) {
                static_key_slow_inc(&paravirt_steal_enabled);
                if (steal_acc)
                        static_key_slow_inc(&paravirt_steal_rq_enabled);
        }

        return 0;
}
arch_initcall(activate_jump_labels);