linux/arch/x86/kernel/kvm.c
/*
 * KVM paravirt_ops implementation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>

#define MMU_QUEUE_SIZE 1024

static int kvmapf = 1;

static int parse_no_kvmapf(char *arg)
{
        kvmapf = 0;
        return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

struct kvm_para_state {
        u8 mmu_queue[MMU_QUEUE_SIZE];
        int mmu_queue_len;
};

static DEFINE_PER_CPU(struct kvm_para_state, para_state);
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);

static struct kvm_para_state *kvm_para_state(void)
{
        return &per_cpu(para_state, raw_smp_processor_id());
}

/*
 * No need for any "IO delay" on KVM
 */
static void kvm_io_delay(void)
{
}

#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

struct kvm_task_sleep_node {
        struct hlist_node link;
        wait_queue_head_t wq;
        u32 token;
        int cpu;
        bool halted;
        struct mm_struct *mm;
};

static struct kvm_task_sleep_head {
        spinlock_t lock;
        struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

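/*
 * Look up the sleep node for @token in bucket @b.
 * The caller must hold b->lock.
 */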
static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
                                                  u32 token)
{
        struct hlist_node *p;

        hlist_for_each(p, &b->list) {
                struct kvm_task_sleep_node *n =
                        hlist_entry(p, typeof(*n), link);
                if (n->token == token)
                        return n;
        }

        return NULL;
}

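/*
 * Called from the 'page not present' half of the async page fault path:
 * sleep until the host reports (via kvm_async_pf_task_wake) that the page
 * identified by @token is available again.  If we cannot schedule (the CPU
 * was idle or preempt_count() > 1), halt instead of sleeping and wait for
 * the reschedule IPI sent by apf_task_wake_one().
 */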
void kvm_async_pf_task_wait(u32 token)
{
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
        struct kvm_task_sleep_node n, *e;
        DEFINE_WAIT(wait);
        int cpu, idle;

        cpu = get_cpu();
        idle = idle_cpu(cpu);
        put_cpu();

        spin_lock(&b->lock);
        e = _find_apf_task(b, token);
        if (e) {
                /* dummy entry exists -> wake up was delivered ahead of PF */
                hlist_del(&e->link);
                kfree(e);
                spin_unlock(&b->lock);
                return;
        }

        n.token = token;
        n.cpu = smp_processor_id();
        n.mm = current->active_mm;
        n.halted = idle || preempt_count() > 1;
        atomic_inc(&n.mm->mm_count);
        init_waitqueue_head(&n.wq);
        hlist_add_head(&n.link, &b->list);
        spin_unlock(&b->lock);

        for (;;) {
                if (!n.halted)
                        prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
                if (hlist_unhashed(&n.link))
                        break;

                if (!n.halted) {
                        local_irq_enable();
                        schedule();
                        local_irq_disable();
                } else {
                        /*
                         * We cannot reschedule. So halt.
                         */
                        native_safe_halt();
                        local_irq_disable();
                }
        }
        if (!n.halted)
                finish_wait(&n.wq, &wait);

        return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);

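/*
 * Wake a single sleeper: unhash its node and drop the mm reference taken in
 * kvm_async_pf_task_wait(), then either send a reschedule IPI (halted CPU)
 * or wake the task sleeping on the node's waitqueue.  Dummy entries
 * (mm == NULL) are only unhashed.
 */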
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
        hlist_del_init(&n->link);
        if (!n->mm)
                return;
        mmdrop(n->mm);
        if (n->halted)
                smp_send_reschedule(n->cpu);
        else if (waitqueue_active(&n->wq))
                wake_up(&n->wq);
}

static void apf_task_wake_all(void)
{
        int i;

        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
                struct hlist_node *p, *next;
                struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
                spin_lock(&b->lock);
                hlist_for_each_safe(p, next, &b->list) {
                        struct kvm_task_sleep_node *n =
                                hlist_entry(p, typeof(*n), link);
                        if (n->cpu == smp_processor_id())
                                apf_task_wake_one(n);
                }
                spin_unlock(&b->lock);
        }
}

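/*
 * Called from the 'page ready' half of the async page fault path.  A token
 * of ~0 means "wake every sleeper on this CPU".  If no sleeper is found for
 * @token, the wake-up raced ahead of the corresponding wait, so a dummy
 * entry is queued for kvm_async_pf_task_wait() to find and discard.
 */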
void kvm_async_pf_task_wake(u32 token)
{
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
        struct kvm_task_sleep_node *n;

        if (token == ~0) {
                apf_task_wake_all();
                return;
        }

again:
        spin_lock(&b->lock);
        n = _find_apf_task(b, token);
        if (!n) {
                /*
                 * async PF was not yet handled.
                 * Add dummy entry for the token.
                 */
                n = kmalloc(sizeof(*n), GFP_ATOMIC);
                if (!n) {
                        /*
                         * Allocation failed! Busy wait while other cpu
                         * handles async PF.
                         */
                        spin_unlock(&b->lock);
                        cpu_relax();
                        goto again;
                }
                n->token = token;
                n->cpu = smp_processor_id();
                n->mm = NULL;
                init_waitqueue_head(&n->wq);
                hlist_add_head(&n->link, &b->list);
        } else
                apf_task_wake_one(n);
        spin_unlock(&b->lock);
        return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

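/*
 * Fetch the fault reason the host wrote into this CPU's apf_reason slot
 * and clear it so the next fault starts from a clean state.  Returns 0
 * when async PF is not enabled on this CPU.
 */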
u32 kvm_read_and_reset_pf_reason(void)
{
        u32 reason = 0;

        if (__get_cpu_var(apf_reason).enabled) {
                reason = __get_cpu_var(apf_reason).reason;
                __get_cpu_var(apf_reason).reason = 0;
        }

        return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);

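/*
 * Paravirtual #PF entry point, installed in place of the native page fault
 * handler by kvm_apf_trap_init().  Real faults are forwarded to
 * do_page_fault(); the two async PF reasons are dispatched to the wait/wake
 * helpers with the token the host placed in CR2.
 */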
dotraplinkage void __kprobes
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
        switch (kvm_read_and_reset_pf_reason()) {
        default:
                do_page_fault(regs, error_code);
                break;
        case KVM_PV_REASON_PAGE_NOT_PRESENT:
                /* page is swapped out by the host. */
                kvm_async_pf_task_wait((u32)read_cr2());
                break;
        case KVM_PV_REASON_PAGE_READY:
                kvm_async_pf_task_wake((u32)read_cr2());
                break;
        }
}

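/*
 * Submit an MMU op buffer to the host via the KVM_HC_MMU_OP hypercall.
 * The hypercall returns how many bytes it consumed, so loop until the
 * whole buffer has been processed.
 */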
static void kvm_mmu_op(void *buffer, unsigned len)
{
        int r;
        unsigned long a1, a2;

        do {
                a1 = __pa(buffer);
                a2 = 0;   /* on i386 __pa() always returns <4G */
                r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
                buffer += r;
                len -= r;
        } while (len);
}

static void mmu_queue_flush(struct kvm_para_state *state)
{
        if (state->mmu_queue_len) {
                kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
                state->mmu_queue_len = 0;
        }
}

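/*
 * Queue an MMU op while in lazy MMU mode so that several ops are handed to
 * the host in a single hypercall; outside lazy mode the op is issued
 * immediately.  The queue is flushed when it fills up or when lazy mode is
 * left.
 */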
static void kvm_deferred_mmu_op(void *buffer, int len)
{
        struct kvm_para_state *state = kvm_para_state();

        if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
                kvm_mmu_op(buffer, len);
                return;
        }
        if (state->mmu_queue_len + len > sizeof state->mmu_queue)
                mmu_queue_flush(state);
        memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
        state->mmu_queue_len += len;
}

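/*
 * Ask the host to write @val into the pte at @dest.  The host needs a
 * physical address; with CONFIG_HIGHPTE the pte page may be a kmap'ed
 * highmem page, so resolve it via the page's pfn instead of __pa().
 */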
static void kvm_mmu_write(void *dest, u64 val)
{
        __u64 pte_phys;
        struct kvm_mmu_op_write_pte wpte;

#ifdef CONFIG_HIGHPTE
        struct page *page;
        unsigned long dst = (unsigned long) dest;

        page = kmap_atomic_to_page(dest);
        pte_phys = page_to_pfn(page);
        pte_phys <<= PAGE_SHIFT;
        pte_phys += (dst & ~(PAGE_MASK));
#else
        pte_phys = (unsigned long)__pa(dest);
#endif
        wpte.header.op = KVM_MMU_OP_WRITE_PTE;
        wpte.pte_val = val;
        wpte.pte_phys = pte_phys;

        kvm_deferred_mmu_op(&wpte, sizeof wpte);
}

/*
 * We only need to hook operations that are MMU writes.  We hook these so that
 * we can use lazy MMU mode to batch these operations.  We could probably
 * improve the performance of the host code if we used some of the information
 * here to simplify processing of batched writes.
 */
static void kvm_set_pte(pte_t *ptep, pte_t pte)
{
        kvm_mmu_write(ptep, pte_val(pte));
}

static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
                           pte_t *ptep, pte_t pte)
{
        kvm_mmu_write(ptep, pte_val(pte));
}

static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
        kvm_mmu_write(pmdp, pmd_val(pmd));
}

#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
{
        kvm_mmu_write(ptep, pte_val(pte));
}

static void kvm_pte_clear(struct mm_struct *mm,
                          unsigned long addr, pte_t *ptep)
{
        kvm_mmu_write(ptep, 0);
}

static void kvm_pmd_clear(pmd_t *pmdp)
{
        kvm_mmu_write(pmdp, 0);
}
#endif

static void kvm_set_pud(pud_t *pudp, pud_t pud)
{
        kvm_mmu_write(pudp, pud_val(pud));
}

#if PAGETABLE_LEVELS == 4
static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
        kvm_mmu_write(pgdp, pgd_val(pgd));
}
#endif
#endif /* PAGETABLE_LEVELS >= 3 */

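/*
 * TLB flushes go through the deferred queue as well, so a flush issued
 * inside lazy MMU mode is batched together with the preceding pte writes.
 */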
static void kvm_flush_tlb(void)
{
        struct kvm_mmu_op_flush_tlb ftlb = {
                .header.op = KVM_MMU_OP_FLUSH_TLB,
        };

        kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
}

static void kvm_release_pt(unsigned long pfn)
{
        struct kvm_mmu_op_release_pt rpt = {
                .header.op = KVM_MMU_OP_RELEASE_PT,
                .pt_phys = (u64)pfn << PAGE_SHIFT,
        };

        kvm_mmu_op(&rpt, sizeof rpt);
}

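/*
 * Lazy MMU mode: while it is active, kvm_deferred_mmu_op() only queues
 * work; the queue is flushed to the host when the mode is left.
 */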
static void kvm_enter_lazy_mmu(void)
{
        paravirt_enter_lazy_mmu();
}

static void kvm_leave_lazy_mmu(void)
{
        struct kvm_para_state *state = kvm_para_state();

        mmu_queue_flush(state);
        paravirt_leave_lazy_mmu();
}

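/*
 * Install the paravirt hooks for every feature the host advertises:
 * a no-op I/O delay, and the batched MMU write/flush/release operations
 * when KVM_FEATURE_MMU_OP is available.
 */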
static void __init paravirt_ops_setup(void)
{
        pv_info.name = "KVM";
        pv_info.paravirt_enabled = 1;

        if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
                pv_cpu_ops.io_delay = kvm_io_delay;

        if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
                pv_mmu_ops.set_pte = kvm_set_pte;
                pv_mmu_ops.set_pte_at = kvm_set_pte_at;
                pv_mmu_ops.set_pmd = kvm_set_pmd;
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
                pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
                pv_mmu_ops.pte_clear = kvm_pte_clear;
                pv_mmu_ops.pmd_clear = kvm_pmd_clear;
#endif
                pv_mmu_ops.set_pud = kvm_set_pud;
#if PAGETABLE_LEVELS == 4
                pv_mmu_ops.set_pgd = kvm_set_pgd;
#endif
#endif
                pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
                pv_mmu_ops.release_pte = kvm_release_pt;
                pv_mmu_ops.release_pmd = kvm_release_pt;
                pv_mmu_ops.release_pud = kvm_release_pt;

                pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
                pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
        }
#ifdef CONFIG_X86_IO_APIC
        no_timer_check = 1;
#endif
}

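/*
 * Per-CPU async PF setup: register this CPU's apf_reason area with the
 * host through MSR_KVM_ASYNC_PF_EN.  With CONFIG_PREEMPT the host is also
 * allowed to deliver async page faults while the guest runs in kernel mode.
 */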
void __cpuinit kvm_guest_cpu_init(void)
{
        if (!kvm_para_available())
                return;

        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
                u64 pa = __pa(&__get_cpu_var(apf_reason));

#ifdef CONFIG_PREEMPT
                pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
                wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
                __get_cpu_var(apf_reason).enabled = 1;
                printk(KERN_INFO"KVM setup async PF for cpu %d\n",
                       smp_processor_id());
        }
}

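/*
 * Undo kvm_guest_cpu_init() for the current CPU: clear the async PF MSR so
 * the host stops writing into apf_reason.  Used on reboot and when a CPU is
 * taken offline.
 */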
static void kvm_pv_disable_apf(void *unused)
{
        if (!__get_cpu_var(apf_reason).enabled)
                return;

        wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
        __get_cpu_var(apf_reason).enabled = 0;

        printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
               smp_processor_id());
}

static int kvm_pv_reboot_notify(struct notifier_block *nb,
                                unsigned long code, void *unused)
{
        if (code == SYS_RESTART)
                on_each_cpu(kvm_pv_disable_apf, NULL, 1);
        return NOTIFY_DONE;
}

static struct notifier_block kvm_pv_reboot_nb = {
        .notifier_call = kvm_pv_reboot_notify,
};

#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
#ifdef CONFIG_KVM_CLOCK
        WARN_ON(kvm_register_clock("primary cpu clock"));
#endif
        kvm_guest_cpu_init();
        native_smp_prepare_boot_cpu();
}

static void kvm_guest_cpu_online(void *dummy)
{
        kvm_guest_cpu_init();
}

static void kvm_guest_cpu_offline(void *dummy)
{
        kvm_pv_disable_apf(NULL);
        apf_task_wake_all();
}

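/*
 * CPU hotplug notifier: (re)enable async PF on a CPU that comes online and
 * disable it (waking any sleepers) on a CPU about to go down.
 */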
static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
                                    unsigned long action, void *hcpu)
{
        int cpu = (unsigned long)hcpu;
        switch (action) {
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
        case CPU_ONLINE_FROZEN:
                smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
        .notifier_call  = kvm_cpu_notify,
};
#endif

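/*
 * Replace the #PF (vector 14) handler with the async-PF aware one.
 */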
static void __init kvm_apf_trap_init(void)
{
        set_intr_gate(14, &async_page_fault);
}

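/*
 * Guest-side entry point, called during early boot when KVM is detected:
 * set up paravirt ops, the reboot notifier and the async PF sleeper hash,
 * then either hook CPU bring-up (CONFIG_SMP) or initialize the boot CPU
 * directly.
 */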
void __init kvm_guest_init(void)
{
        int i;

        if (!kvm_para_available())
                return;

        paravirt_ops_setup();
        register_reboot_notifier(&kvm_pv_reboot_nb);
        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
                spin_lock_init(&async_pf_sleepers[i].lock);
        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
                x86_init.irqs.trap_init = kvm_apf_trap_init;

#ifdef CONFIG_SMP
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
        register_cpu_notifier(&kvm_cpu_notifier);
#else
        kvm_guest_cpu_init();
#endif
}