linux/arch/x86/kernel/kvm.c
/*
 * KVM paravirt_ops implementation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#include <linux/context_tracking.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <linux/debugfs.h>
#include <linux/nmi.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/idle.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
#include <asm/kvm_guest.h>

static int kvmapf = 1;

static int parse_no_kvmapf(char *arg)
{
        kvmapf = 0;
        return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

static int steal_acc = 1;
static int parse_no_stealacc(char *arg)
{
        steal_acc = 0;
        return 0;
}

early_param("no-steal-acc", parse_no_stealacc);

static int kvmclock_vsyscall = 1;
static int parse_no_kvmclock_vsyscall(char *arg)
{
        kvmclock_vsyscall = 0;
        return 0;
}

early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);

static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;

/*
 * No need for any "IO delay" on KVM
 */
static void kvm_io_delay(void)
{
}

#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

struct kvm_task_sleep_node {
        struct hlist_node link;
        wait_queue_head_t wq;
        u32 token;
        int cpu;
        bool halted;
};

static struct kvm_task_sleep_head {
        spinlock_t lock;
        struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
                                                  u32 token)
{
        struct hlist_node *p;

        hlist_for_each(p, &b->list) {
                struct kvm_task_sleep_node *n =
                        hlist_entry(p, typeof(*n), link);
                if (n->token == token)
                        return n;
        }

        return NULL;
}

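/*
 * Wait until the host signals that the page which caused the async page
 * fault identified by @token has been brought in.  Called from the async
 * page fault handler with interrupts disabled; contexts that cannot
 * schedule (the idle task, or nested preempt-disabled sections) halt
 * instead of sleeping on the wait queue.
 */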
void kvm_async_pf_task_wait(u32 token)
{
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
        struct kvm_task_sleep_node n, *e;
        DEFINE_WAIT(wait);

        rcu_irq_enter();

        spin_lock(&b->lock);
        e = _find_apf_task(b, token);
        if (e) {
                /* dummy entry exists -> wakeup was delivered ahead of PF */
                hlist_del(&e->link);
                kfree(e);
                spin_unlock(&b->lock);

                rcu_irq_exit();
                return;
        }

        n.token = token;
        n.cpu = smp_processor_id();
        n.halted = is_idle_task(current) || preempt_count() > 1;
        init_waitqueue_head(&n.wq);
        hlist_add_head(&n.link, &b->list);
        spin_unlock(&b->lock);

        for (;;) {
                if (!n.halted)
                        prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
                if (hlist_unhashed(&n.link))
                        break;

                if (!n.halted) {
                        local_irq_enable();
                        schedule();
                        local_irq_disable();
                } else {
                        /*
                         * We cannot reschedule. So halt.
                         */
                        rcu_irq_exit();
                        native_safe_halt();
                        rcu_irq_enter();
                        local_irq_disable();
                }
        }
        if (!n.halted)
                finish_wait(&n.wq, &wait);

        rcu_irq_exit();
        return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);

static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
        hlist_del_init(&n->link);
        if (n->halted)
                smp_send_reschedule(n->cpu);
        else if (waitqueue_active(&n->wq))
                wake_up(&n->wq);
}

static void apf_task_wake_all(void)
{
        int i;

        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
                struct hlist_node *p, *next;
                struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
                spin_lock(&b->lock);
                hlist_for_each_safe(p, next, &b->list) {
                        struct kvm_task_sleep_node *n =
                                hlist_entry(p, typeof(*n), link);
                        if (n->cpu == smp_processor_id())
                                apf_task_wake_one(n);
                }
                spin_unlock(&b->lock);
        }
}

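/*
 * Wake the task, if any, that is sleeping on @token.  If the wakeup
 * arrives before the fault itself was handled, a dummy entry is queued
 * so that kvm_async_pf_task_wait() can return immediately.  A token of
 * ~0 means "wake every sleeper on this cpu".
 */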
void kvm_async_pf_task_wake(u32 token)
{
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
        struct kvm_task_sleep_node *n;

        if (token == ~0) {
                apf_task_wake_all();
                return;
        }

again:
        spin_lock(&b->lock);
        n = _find_apf_task(b, token);
        if (!n) {
                /*
                 * async PF was not yet handled.
                 * Add dummy entry for the token.
                 */
                n = kzalloc(sizeof(*n), GFP_ATOMIC);
                if (!n) {
                        /*
                         * Allocation failed! Busy wait while other cpu
                         * handles async PF.
                         */
                        spin_unlock(&b->lock);
                        cpu_relax();
                        goto again;
                }
                n->token = token;
                n->cpu = smp_processor_id();
                init_waitqueue_head(&n->wq);
                hlist_add_head(&n->link, &b->list);
        } else
                apf_task_wake_one(n);
        spin_unlock(&b->lock);
        return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

u32 kvm_read_and_reset_pf_reason(void)
{
        u32 reason = 0;

        if (__this_cpu_read(apf_reason.enabled)) {
                reason = __this_cpu_read(apf_reason.reason);
                __this_cpu_write(apf_reason.reason, 0);
        }

        return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);

dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
        enum ctx_state prev_state;

        switch (kvm_read_and_reset_pf_reason()) {
        default:
                trace_do_page_fault(regs, error_code);
                break;
        case KVM_PV_REASON_PAGE_NOT_PRESENT:
                /* page is swapped out by the host. */
                prev_state = exception_enter();
                exit_idle();
                kvm_async_pf_task_wait((u32)read_cr2());
                exception_exit(prev_state);
                break;
        case KVM_PV_REASON_PAGE_READY:
                rcu_irq_enter();
                exit_idle();
                kvm_async_pf_task_wake((u32)read_cr2());
                rcu_irq_exit();
                break;
        }
}
NOKPROBE_SYMBOL(do_async_page_fault);

static void __init paravirt_ops_setup(void)
{
        pv_info.name = "KVM";

        /*
         * KVM isn't paravirt in the sense of paravirt_enabled.  A KVM
         * guest kernel works like a bare metal kernel with additional
         * features, and paravirt_enabled is about features that are
         * missing.
         */
        pv_info.paravirt_enabled = 0;

        if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
                pv_cpu_ops.io_delay = kvm_io_delay;

#ifdef CONFIG_X86_IO_APIC
        no_timer_check = 1;
#endif
}

static void kvm_register_steal_time(void)
{
        int cpu = smp_processor_id();
        struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

        if (!has_steal_clock)
                return;

        memset(st, 0, sizeof(*st));

        wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
        pr_info("kvm-stealtime: cpu %d, msr %llx\n",
                cpu, (unsigned long long) slow_virt_to_phys(st));
}

static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;

static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
        /**
         * This relies on __test_and_clear_bit to modify the memory
         * in a way that is atomic with respect to the local CPU.
         * The hypervisor only accesses this memory from the local CPU so
         * there's no need for lock or memory barriers.
         * An optimization barrier is implied in apic write.
         */
        if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
                return;
        apic_write(APIC_EOI, APIC_EOI_ACK);
}

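/*
 * Per-cpu setup: enable async page faults, PV EOI and steal time
 * accounting for this vcpu, depending on which features the host
 * advertises.  Called on the boot cpu and on every cpu that comes online.
 */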
static void kvm_guest_cpu_init(void)
{
        if (!kvm_para_available())
                return;

        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
                u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));

#ifdef CONFIG_PREEMPT
                pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
                wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
                __this_cpu_write(apf_reason.enabled, 1);
                printk(KERN_INFO"KVM setup async PF for cpu %d\n",
                       smp_processor_id());
        }

        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
                unsigned long pa;
                /* Size alignment is implied but just to make it explicit. */
                BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
                __this_cpu_write(kvm_apic_eoi, 0);
                pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
                        | KVM_MSR_ENABLED;
                wrmsrl(MSR_KVM_PV_EOI_EN, pa);
        }

        if (has_steal_clock)
                kvm_register_steal_time();
}

static void kvm_pv_disable_apf(void)
{
        if (!__this_cpu_read(apf_reason.enabled))
                return;

        wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
        __this_cpu_write(apf_reason.enabled, 0);

        printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
               smp_processor_id());
}

static void kvm_pv_guest_cpu_reboot(void *unused)
{
        /*
         * We disable PV EOI before we load a new kernel by kexec,
         * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
         * New kernel can re-enable when it boots.
         */
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                wrmsrl(MSR_KVM_PV_EOI_EN, 0);
        kvm_pv_disable_apf();
        kvm_disable_steal_time();
}

static int kvm_pv_reboot_notify(struct notifier_block *nb,
                                unsigned long code, void *unused)
{
        if (code == SYS_RESTART)
                on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
        return NOTIFY_DONE;
}

static struct notifier_block kvm_pv_reboot_nb = {
        .notifier_call = kvm_pv_reboot_notify,
};

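/*
 * Read the steal time the host publishes for @cpu.  The version field is
 * even while the data is stable and odd while the host is updating it,
 * so re-read until a consistent, even version is observed.
 */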
static u64 kvm_steal_clock(int cpu)
{
        u64 steal;
        struct kvm_steal_time *src;
        int version;

        src = &per_cpu(steal_time, cpu);
        do {
                version = src->version;
                rmb();
                steal = src->steal;
                rmb();
        } while ((version & 1) || (version != src->version));

        return steal;
}

void kvm_disable_steal_time(void)
{
        if (!has_steal_clock)
                return;

        wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}

#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
        kvm_guest_cpu_init();
        native_smp_prepare_boot_cpu();
        kvm_spinlock_init();
}

static void kvm_guest_cpu_online(void *dummy)
{
        kvm_guest_cpu_init();
}

static void kvm_guest_cpu_offline(void *dummy)
{
        kvm_disable_steal_time();
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                wrmsrl(MSR_KVM_PV_EOI_EN, 0);
        kvm_pv_disable_apf();
        apf_task_wake_all();
}

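/*
 * CPU hotplug callback: arm the paravirt features on a cpu that comes
 * online (or whose offlining failed) and tear them down again before it
 * goes offline.
 */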
static int kvm_cpu_notify(struct notifier_block *self, unsigned long action,
                          void *hcpu)
{
        int cpu = (unsigned long)hcpu;
        switch (action) {
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
        case CPU_ONLINE_FROZEN:
                smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block kvm_cpu_notifier = {
        .notifier_call  = kvm_cpu_notify,
};
#endif

static void __init kvm_apf_trap_init(void)
{
        set_intr_gate(14, async_page_fault);
}

void __init kvm_guest_init(void)
{
        int i;

        if (!kvm_para_available())
                return;

        paravirt_ops_setup();
        register_reboot_notifier(&kvm_pv_reboot_nb);
        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
                spin_lock_init(&async_pf_sleepers[i].lock);
        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
                x86_init.irqs.trap_init = kvm_apf_trap_init;

        if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
                has_steal_clock = 1;
                pv_time_ops.steal_clock = kvm_steal_clock;
        }

        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                apic_set_eoi_write(kvm_guest_apic_eoi_write);

        if (kvmclock_vsyscall)
                kvm_setup_vsyscall_timeinfo();

#ifdef CONFIG_SMP
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
        register_cpu_notifier(&kvm_cpu_notifier);
#else
        kvm_guest_cpu_init();
#endif

        /*
         * Hard lockup detection is enabled by default. Disable it, as guests
         * can get false positives too easily, for example if the host is
         * overcommitted.
         */
        hardlockup_detector_disable();
}

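/*
 * Locate the base of KVM's CPUID leaves by scanning the hypervisor CPUID
 * range for the "KVMKVMKVM" signature.  Returns 0 when not running on
 * KVM; kvm_cpuid_base() below caches the result.
 */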
static noinline uint32_t __kvm_cpuid_base(void)
{
        if (boot_cpu_data.cpuid_level < 0)
                return 0;       /* So we don't blow up on old processors */

        if (cpu_has_hypervisor)
                return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);

        return 0;
}

static inline uint32_t kvm_cpuid_base(void)
{
        static int kvm_cpuid_base = -1;

        if (kvm_cpuid_base == -1)
                kvm_cpuid_base = __kvm_cpuid_base();

        return kvm_cpuid_base;
}

bool kvm_para_available(void)
{
        return kvm_cpuid_base() != 0;
}
EXPORT_SYMBOL_GPL(kvm_para_available);

unsigned int kvm_arch_para_features(void)
{
        return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}

static uint32_t __init kvm_detect(void)
{
        return kvm_cpuid_base();
}

const struct hypervisor_x86 x86_hyper_kvm __refconst = {
        .name                   = "KVM",
        .detect                 = kvm_detect,
        .x2apic_available       = kvm_para_available,
};
EXPORT_SYMBOL_GPL(x86_hyper_kvm);

static __init int activate_jump_labels(void)
{
        if (has_steal_clock) {
                static_key_slow_inc(&paravirt_steal_enabled);
                if (steal_acc)
                        static_key_slow_inc(&paravirt_steal_rq_enabled);
        }

        return 0;
}
arch_initcall(activate_jump_labels);

#ifdef CONFIG_PARAVIRT_SPINLOCKS

/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
static void kvm_kick_cpu(int cpu)
{
        int apicid;
        unsigned long flags = 0;

        apicid = per_cpu(x86_cpu_to_apicid, cpu);
        kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}


#ifdef CONFIG_QUEUED_SPINLOCKS

#include <asm/qspinlock.h>

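/*
 * pv qspinlock wait hook: halt this vcpu if the byte at @ptr still
 * contains @val.  The cpu that hands over the lock wakes us via the
 * KVM_HC_KICK_CPU hypercall; callers tolerate spurious wakeups and
 * re-check the lock word themselves.
 */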
static void kvm_wait(u8 *ptr, u8 val)
{
        unsigned long flags;

        if (in_nmi())
                return;

        local_irq_save(flags);

        if (READ_ONCE(*ptr) != val)
                goto out;

        /*
         * Halt until it's our turn and we are kicked. Note that we do a
         * safe halt for the irq-enabled case to avoid a hang when the lock
         * info is overwritten in the irq spinlock slowpath and no spurious
         * interrupt occurs to save us.
         */
        if (arch_irqs_disabled_flags(flags))
                halt();
        else
                safe_halt();

out:
        local_irq_restore(flags);
}

#else /* !CONFIG_QUEUED_SPINLOCKS */

enum kvm_contention_stat {
        TAKEN_SLOW,
        TAKEN_SLOW_PICKUP,
        RELEASED_SLOW,
        RELEASED_SLOW_KICKED,
        NR_CONTENTION_STATS
};

#ifdef CONFIG_KVM_DEBUG_FS
#define HISTO_BUCKETS   30

static struct kvm_spinlock_stats
{
        u32 contention_stats[NR_CONTENTION_STATS];
        u32 histo_spin_blocked[HISTO_BUCKETS+1];
        u64 time_blocked;
} spinlock_stats;

static u8 zero_stats;

static inline void check_zero(void)
{
        u8 ret;
        u8 old;

        old = READ_ONCE(zero_stats);
        if (unlikely(old)) {
                ret = cmpxchg(&zero_stats, old, 0);
                /* This ensures only one fellow resets the stat */
                if (ret == old)
                        memset(&spinlock_stats, 0, sizeof(spinlock_stats));
        }
}

static inline void add_stats(enum kvm_contention_stat var, u32 val)
{
        check_zero();
        spinlock_stats.contention_stats[var] += val;
}


static inline u64 spin_time_start(void)
{
        return sched_clock();
}

static void __spin_time_accum(u64 delta, u32 *array)
{
        unsigned index;

        index = ilog2(delta);
        check_zero();

        if (index < HISTO_BUCKETS)
                array[index]++;
        else
                array[HISTO_BUCKETS]++;
}

static inline void spin_time_accum_blocked(u64 start)
{
        u32 delta;

        delta = sched_clock() - start;
        __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
        spinlock_stats.time_blocked += delta;
}

static struct dentry *d_spin_debug;
static struct dentry *d_kvm_debug;

static struct dentry *kvm_init_debugfs(void)
{
        d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
        if (!d_kvm_debug)
                printk(KERN_WARNING "Could not create 'kvm-guest' debugfs directory\n");

        return d_kvm_debug;
}

static int __init kvm_spinlock_debugfs(void)
{
        struct dentry *d_kvm;

        d_kvm = kvm_init_debugfs();
        if (d_kvm == NULL)
                return -ENOMEM;

        d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);

        debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);

        debugfs_create_u32("taken_slow", 0444, d_spin_debug,
                   &spinlock_stats.contention_stats[TAKEN_SLOW]);
        debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
                   &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);

        debugfs_create_u32("released_slow", 0444, d_spin_debug,
                   &spinlock_stats.contention_stats[RELEASED_SLOW]);
        debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
                   &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);

        debugfs_create_u64("time_blocked", 0444, d_spin_debug,
                           &spinlock_stats.time_blocked);

        debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
                     spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);

        return 0;
}
fs_initcall(kvm_spinlock_debugfs);
#else  /* !CONFIG_KVM_DEBUG_FS */
static inline void add_stats(enum kvm_contention_stat var, u32 val)
{
}

static inline u64 spin_time_start(void)
{
        return 0;
}

static inline void spin_time_accum_blocked(u64 start)
{
}
#endif  /* CONFIG_KVM_DEBUG_FS */

struct kvm_lock_waiting {
        struct arch_spinlock *lock;
        __ticket_t want;
};

/* cpus 'waiting' on a spinlock to become available */
static cpumask_t waiting_cpus;

/* Track spinlock on which a cpu is waiting */
static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);

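/*
 * Ticket-lock slowpath: publish in klock_waiting which lock and ticket
 * this cpu is waiting for, mark the lock as contended, then halt until
 * kvm_unlock_kick() wakes us (or the ticket already became ours while we
 * were setting up).
 */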
__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
{
        struct kvm_lock_waiting *w;
        int cpu;
        u64 start;
        unsigned long flags;
        __ticket_t head;

        if (in_nmi())
                return;

        w = this_cpu_ptr(&klock_waiting);
        cpu = smp_processor_id();
        start = spin_time_start();

        /*
         * Make sure an interrupt handler can't upset things in a
         * partially setup state.
         */
        local_irq_save(flags);

        /*
         * The ordering protocol on this is that the "lock" pointer
         * may only be set non-NULL if the "want" ticket is correct.
         * If we're updating "want", we must first clear "lock".
         */
        w->lock = NULL;
        smp_wmb();
        w->want = want;
        smp_wmb();
        w->lock = lock;

        add_stats(TAKEN_SLOW, 1);

        /*
         * This uses set_bit, which is atomic, but we should not rely on its
         * reordering guarantees. So a barrier is needed after this call.
         */
        cpumask_set_cpu(cpu, &waiting_cpus);

        barrier();

        /*
         * Mark entry to slowpath before doing the pickup test to make
         * sure we don't deadlock with an unlocker.
         */
        __ticket_enter_slowpath(lock);

        /* make sure enter_slowpath, which is atomic, does not cross the read */
        smp_mb__after_atomic();

        /*
         * Check again to make sure it didn't become free while
         * we weren't looking.
         */
        head = READ_ONCE(lock->tickets.head);
        if (__tickets_equal(head, want)) {
                add_stats(TAKEN_SLOW_PICKUP, 1);
                goto out;
        }

        /*
         * Halt until it's our turn and we are kicked. Note that we do a
         * safe halt for the irq-enabled case to avoid a hang when the lock
         * info is overwritten in the irq spinlock slowpath and no spurious
         * interrupt occurs to save us.
         */
        if (arch_irqs_disabled_flags(flags))
                halt();
        else
                safe_halt();

out:
        cpumask_clear_cpu(cpu, &waiting_cpus);
        w->lock = NULL;
        local_irq_restore(flags);
        spin_time_accum_blocked(start);
}
PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);

/* Kick vcpu waiting on @lock->head to reach value @ticket */
static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
{
        int cpu;

        add_stats(RELEASED_SLOW, 1);
        for_each_cpu(cpu, &waiting_cpus) {
                const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
                if (READ_ONCE(w->lock) == lock &&
                    READ_ONCE(w->want) == ticket) {
                        add_stats(RELEASED_SLOW_KICKED, 1);
                        kvm_kick_cpu(cpu);
                        break;
                }
        }
}

#endif /* !CONFIG_QUEUED_SPINLOCKS */

/*
 * Set up pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
 */
void __init kvm_spinlock_init(void)
{
        if (!kvm_para_available())
                return;
        /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
        if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
                return;

#ifdef CONFIG_QUEUED_SPINLOCKS
        __pv_init_lock_hash();
        pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
        pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
        pv_lock_ops.wait = kvm_wait;
        pv_lock_ops.kick = kvm_kick_cpu;
#else /* !CONFIG_QUEUED_SPINLOCKS */
        pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
        pv_lock_ops.unlock_kick = kvm_unlock_kick;
#endif
}

static __init int kvm_spinlock_init_jump(void)
{
        if (!kvm_para_available())
                return 0;
        if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
                return 0;

        static_key_slow_inc(&paravirt_ticketlocks_enabled);
        printk(KERN_INFO "KVM setup paravirtual spinlock\n");

        return 0;
}
early_initcall(kvm_spinlock_init_jump);

#endif  /* CONFIG_PARAVIRT_SPINLOCKS */
