linux/arch/x86/xen/time.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Xen time implementation.
   4 *
   5 * This is implemented in terms of a clocksource driver which uses
   6 * the hypervisor clock as a nanosecond timebase, and a clockevent
   7 * driver which uses the hypervisor's timer mechanism.
   8 *
   9 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  10 */
  11#include <linux/kernel.h>
  12#include <linux/interrupt.h>
  13#include <linux/clocksource.h>
  14#include <linux/clockchips.h>
  15#include <linux/gfp.h>
  16#include <linux/slab.h>
  17#include <linux/pvclock_gtod.h>
  18#include <linux/timekeeper_internal.h>
  19
  20#include <asm/pvclock.h>
  21#include <asm/xen/hypervisor.h>
  22#include <asm/xen/hypercall.h>
  23
  24#include <xen/events.h>
  25#include <xen/features.h>
  26#include <xen/interface/xen.h>
  27#include <xen/interface/vcpu.h>
  28
  29#include "xen-ops.h"
  30
  31/* Minimum amount of time until next clock event fires */
  32#define TIMER_SLOP      100000
  33
  34static u64 xen_sched_clock_offset __read_mostly;
  35
  36/* Get the TSC speed from Xen */
  37static unsigned long xen_tsc_khz(void)
  38{
  39        struct pvclock_vcpu_time_info *info =
  40                &HYPERVISOR_shared_info->vcpu_info[0].time;
  41
  42        setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
  43        return pvclock_tsc_khz(info);
  44}
  45
  46static u64 xen_clocksource_read(void)
  47{
  48        struct pvclock_vcpu_time_info *src;
  49        u64 ret;
  50
  51        preempt_disable_notrace();
  52        src = &__this_cpu_read(xen_vcpu)->time;
  53        ret = pvclock_clocksource_read(src);
  54        preempt_enable_notrace();
  55        return ret;
  56}
  57
  58static u64 xen_clocksource_get_cycles(struct clocksource *cs)
  59{
  60        return xen_clocksource_read();
  61}
  62
  63static u64 xen_sched_clock(void)
  64{
  65        return xen_clocksource_read() - xen_sched_clock_offset;
  66}
  67
  68static void xen_read_wallclock(struct timespec64 *ts)
  69{
  70        struct shared_info *s = HYPERVISOR_shared_info;
  71        struct pvclock_wall_clock *wall_clock = &(s->wc);
  72        struct pvclock_vcpu_time_info *vcpu_time;
  73
  74        vcpu_time = &get_cpu_var(xen_vcpu)->time;
  75        pvclock_read_wallclock(wall_clock, vcpu_time, ts);
  76        put_cpu_var(xen_vcpu);
  77}
  78
  79static void xen_get_wallclock(struct timespec64 *now)
  80{
  81        xen_read_wallclock(now);
  82}
  83
  84static int xen_set_wallclock(const struct timespec64 *now)
  85{
  86        return -ENODEV;
  87}
  88
  89static int xen_pvclock_gtod_notify(struct notifier_block *nb,
  90                                   unsigned long was_set, void *priv)
  91{
  92        /* Protected by the calling core code serialization */
  93        static struct timespec64 next_sync;
  94
  95        struct xen_platform_op op;
  96        struct timespec64 now;
  97        struct timekeeper *tk = priv;
  98        static bool settime64_supported = true;
  99        int ret;
 100
 101        now.tv_sec = tk->xtime_sec;
 102        now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
 103
 104        /*
 105         * We only take the expensive HV call when the clock was set
 106         * or when the 11 minutes RTC synchronization time elapsed.
 107         */
 108        if (!was_set && timespec64_compare(&now, &next_sync) < 0)
 109                return NOTIFY_OK;
 110
 111again:
 112        if (settime64_supported) {
 113                op.cmd = XENPF_settime64;
 114                op.u.settime64.mbz = 0;
 115                op.u.settime64.secs = now.tv_sec;
 116                op.u.settime64.nsecs = now.tv_nsec;
 117                op.u.settime64.system_time = xen_clocksource_read();
 118        } else {
 119                op.cmd = XENPF_settime32;
 120                op.u.settime32.secs = now.tv_sec;
 121                op.u.settime32.nsecs = now.tv_nsec;
 122                op.u.settime32.system_time = xen_clocksource_read();
 123        }
 124
 125        ret = HYPERVISOR_platform_op(&op);
 126
 127        if (ret == -ENOSYS && settime64_supported) {
 128                settime64_supported = false;
 129                goto again;
 130        }
 131        if (ret < 0)
 132                return NOTIFY_BAD;
 133
 134        /*
 135         * Move the next drift compensation time 11 minutes
 136         * ahead. That's emulating the sync_cmos_clock() update for
 137         * the hardware RTC.
 138         */
 139        next_sync = now;
 140        next_sync.tv_sec += 11 * 60;
 141
 142        return NOTIFY_OK;
 143}
 144
 145static struct notifier_block xen_pvclock_gtod_notifier = {
 146        .notifier_call = xen_pvclock_gtod_notify,
 147};
 148
 149static int xen_cs_enable(struct clocksource *cs)
 150{
 151        vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
 152        return 0;
 153}
 154
 155static struct clocksource xen_clocksource __read_mostly = {
 156        .name   = "xen",
 157        .rating = 400,
 158        .read   = xen_clocksource_get_cycles,
 159        .mask   = CLOCKSOURCE_MASK(64),
 160        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
 161        .enable = xen_cs_enable,
 162};
 163
 164/*
 165   Xen clockevent implementation
 166
 167   Xen has two clockevent implementations:
 168
 169   The old timer_op one works with all released versions of Xen prior
 170   to version 3.0.4.  This version of the hypervisor provides a
 171   single-shot timer with nanosecond resolution.  However, sharing the
 172   same event channel is a 100Hz tick which is delivered while the
 173   vcpu is running.  We don't care about or use this tick, but it will
 174   cause the core time code to think the timer fired too soon, and
 175   will end up resetting it each time.  It could be filtered, but
 176   doing so has complications when the ktime clocksource is not yet
 177   the xen clocksource (ie, at boot time).
 178
 179   The new vcpu_op-based timer interface allows the tick timer period
 180   to be changed or turned off.  The tick timer is not useful as a
 181   periodic timer because events are only delivered to running vcpus.
 182   The one-shot timer can report when a timeout is in the past, so
 183   set_next_event is capable of returning -ETIME when appropriate.
 184   This interface is used when available.
 185*/
 186
 187
 188/*
 189  Get a hypervisor absolute time.  In theory we could maintain an
 190  offset between the kernel's time and the hypervisor's time, and
 191  apply that to a kernel's absolute timeout.  Unfortunately the
 192  hypervisor and kernel times can drift even if the kernel is using
 193  the Xen clocksource, because ntp can warp the kernel's clocksource.
 194*/
 195static s64 get_abs_timeout(unsigned long delta)
 196{
 197        return xen_clocksource_read() + delta;
 198}
 199
 200static int xen_timerop_shutdown(struct clock_event_device *evt)
 201{
 202        /* cancel timeout */
 203        HYPERVISOR_set_timer_op(0);
 204
 205        return 0;
 206}
 207
 208static int xen_timerop_set_next_event(unsigned long delta,
 209                                      struct clock_event_device *evt)
 210{
 211        WARN_ON(!clockevent_state_oneshot(evt));
 212
 213        if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
 214                BUG();
 215
 216        /* We may have missed the deadline, but there's no real way of
 217           knowing for sure.  If the event was in the past, then we'll
 218           get an immediate interrupt. */
 219
 220        return 0;
 221}
 222
 223static struct clock_event_device xen_timerop_clockevent __ro_after_init = {
 224        .name                   = "xen",
 225        .features               = CLOCK_EVT_FEAT_ONESHOT,
 226
 227        .max_delta_ns           = 0xffffffff,
 228        .max_delta_ticks        = 0xffffffff,
 229        .min_delta_ns           = TIMER_SLOP,
 230        .min_delta_ticks        = TIMER_SLOP,
 231
 232        .mult                   = 1,
 233        .shift                  = 0,
 234        .rating                 = 500,
 235
 236        .set_state_shutdown     = xen_timerop_shutdown,
 237        .set_next_event         = xen_timerop_set_next_event,
 238};
 239
 240static int xen_vcpuop_shutdown(struct clock_event_device *evt)
 241{
 242        int cpu = smp_processor_id();
 243
 244        if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu),
 245                               NULL) ||
 246            HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
 247                               NULL))
 248                BUG();
 249
 250        return 0;
 251}
 252
 253static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
 254{
 255        int cpu = smp_processor_id();
 256
 257        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
 258                               NULL))
 259                BUG();
 260
 261        return 0;
 262}
 263
 264static int xen_vcpuop_set_next_event(unsigned long delta,
 265                                     struct clock_event_device *evt)
 266{
 267        int cpu = smp_processor_id();
 268        struct vcpu_set_singleshot_timer single;
 269        int ret;
 270
 271        WARN_ON(!clockevent_state_oneshot(evt));
 272
 273        single.timeout_abs_ns = get_abs_timeout(delta);
 274        /* Get an event anyway, even if the timeout is already expired */
 275        single.flags = 0;
 276
 277        ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu),
 278                                 &single);
 279        BUG_ON(ret != 0);
 280
 281        return ret;
 282}
 283
 284static struct clock_event_device xen_vcpuop_clockevent __ro_after_init = {
 285        .name = "xen",
 286        .features = CLOCK_EVT_FEAT_ONESHOT,
 287
 288        .max_delta_ns = 0xffffffff,
 289        .max_delta_ticks = 0xffffffff,
 290        .min_delta_ns = TIMER_SLOP,
 291        .min_delta_ticks = TIMER_SLOP,
 292
 293        .mult = 1,
 294        .shift = 0,
 295        .rating = 500,
 296
 297        .set_state_shutdown = xen_vcpuop_shutdown,
 298        .set_state_oneshot = xen_vcpuop_set_oneshot,
 299        .set_next_event = xen_vcpuop_set_next_event,
 300};
 301
 302static const struct clock_event_device *xen_clockevent =
 303        &xen_timerop_clockevent;
 304
 305struct xen_clock_event_device {
 306        struct clock_event_device evt;
 307        char name[16];
 308};
 309static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };
 310
 311static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
 312{
 313        struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
 314        irqreturn_t ret;
 315
 316        ret = IRQ_NONE;
 317        if (evt->event_handler) {
 318                evt->event_handler(evt);
 319                ret = IRQ_HANDLED;
 320        }
 321
 322        return ret;
 323}
 324
 325void xen_teardown_timer(int cpu)
 326{
 327        struct clock_event_device *evt;
 328        evt = &per_cpu(xen_clock_events, cpu).evt;
 329
 330        if (evt->irq >= 0) {
 331                unbind_from_irqhandler(evt->irq, NULL);
 332                evt->irq = -1;
 333        }
 334}
 335
 336void xen_setup_timer(int cpu)
 337{
 338        struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
 339        struct clock_event_device *evt = &xevt->evt;
 340        int irq;
 341
 342        WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
 343        if (evt->irq >= 0)
 344                xen_teardown_timer(cpu);
 345
 346        printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
 347
 348        snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);
 349
 350        irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
 351                                      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
 352                                      IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
 353                                      xevt->name, NULL);
 354        (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
 355
 356        memcpy(evt, xen_clockevent, sizeof(*evt));
 357
 358        evt->cpumask = cpumask_of(cpu);
 359        evt->irq = irq;
 360}
 361
 362
 363void xen_setup_cpu_clockevents(void)
 364{
 365        clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
 366}
 367
 368void xen_timer_resume(void)
 369{
 370        int cpu;
 371
 372        if (xen_clockevent != &xen_vcpuop_clockevent)
 373                return;
 374
 375        for_each_online_cpu(cpu) {
 376                if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
 377                                       xen_vcpu_nr(cpu), NULL))
 378                        BUG();
 379        }
 380}
 381
 382static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
 383static u64 xen_clock_value_saved;
 384
 385void xen_save_time_memory_area(void)
 386{
 387        struct vcpu_register_time_memory_area t;
 388        int ret;
 389
 390        xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
 391
 392        if (!xen_clock)
 393                return;
 394
 395        t.addr.v = NULL;
 396
 397        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
 398        if (ret != 0)
 399                pr_notice("Cannot save secondary vcpu_time_info (err %d)",
 400                          ret);
 401        else
 402                clear_page(xen_clock);
 403}
 404
 405void xen_restore_time_memory_area(void)
 406{
 407        struct vcpu_register_time_memory_area t;
 408        int ret;
 409
 410        if (!xen_clock)
 411                goto out;
 412
 413        t.addr.v = &xen_clock->pvti;
 414
 415        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
 416
 417        /*
 418         * We don't disable VDSO_CLOCKMODE_PVCLOCK entirely if it fails to
 419         * register the secondary time info with Xen or if we migrated to a
 420         * host without the necessary flags. On both of these cases what
 421         * happens is either process seeing a zeroed out pvti or seeing no
 422         * PVCLOCK_TSC_STABLE_BIT bit set. Userspace checks the latter and
 423         * if 0, it discards the data in pvti and fallbacks to a system
 424         * call for a reliable timestamp.
 425         */
 426        if (ret != 0)
 427                pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
 428                          ret);
 429
 430out:
 431        /* Need pvclock_resume() before using xen_clocksource_read(). */
 432        pvclock_resume();
 433        xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
 434}
 435
 436static void xen_setup_vsyscall_time_info(void)
 437{
 438        struct vcpu_register_time_memory_area t;
 439        struct pvclock_vsyscall_time_info *ti;
 440        int ret;
 441
 442        ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
 443        if (!ti)
 444                return;
 445
 446        t.addr.v = &ti->pvti;
 447
 448        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
 449        if (ret) {
 450                pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (err %d)\n", ret);
 451                free_page((unsigned long)ti);
 452                return;
 453        }
 454
 455        /*
 456         * If primary time info had this bit set, secondary should too since
 457         * it's the same data on both just different memory regions. But we
 458         * still check it in case hypervisor is buggy.
 459         */
 460        if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
 461                t.addr.v = NULL;
 462                ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
 463                                         0, &t);
 464                if (!ret)
 465                        free_page((unsigned long)ti);
 466
 467                pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (tsc unstable)\n");
 468                return;
 469        }
 470
 471        xen_clock = ti;
 472        pvclock_set_pvti_cpu0_va(xen_clock);
 473
 474        xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
 475}
 476
 477static void __init xen_time_init(void)
 478{
 479        struct pvclock_vcpu_time_info *pvti;
 480        int cpu = smp_processor_id();
 481        struct timespec64 tp;
 482
 483        /* As Dom0 is never moved, no penalty on using TSC there */
 484        if (xen_initial_domain())
 485                xen_clocksource.rating = 275;
 486
 487        clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
 488
 489        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
 490                               NULL) == 0) {
 491                /* Successfully turned off 100Hz tick, so we have the
 492                   vcpuop-based timer interface */
 493                printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
 494                xen_clockevent = &xen_vcpuop_clockevent;
 495        }
 496
 497        /* Set initial system time with full resolution */
 498        xen_read_wallclock(&tp);
 499        do_settimeofday64(&tp);
 500
 501        setup_force_cpu_cap(X86_FEATURE_TSC);
 502
 503        /*
 504         * We check ahead on the primary time info if this
 505         * bit is supported hence speeding up Xen clocksource.
 506         */
 507        pvti = &__this_cpu_read(xen_vcpu)->time;
 508        if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
 509                pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 510                xen_setup_vsyscall_time_info();
 511        }
 512
 513        xen_setup_runstate_info(cpu);
 514        xen_setup_timer(cpu);
 515        xen_setup_cpu_clockevents();
 516
 517        xen_time_setup_guest();
 518
 519        if (xen_initial_domain())
 520                pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
 521}
 522
 523static void __init xen_init_time_common(void)
 524{
 525        xen_sched_clock_offset = xen_clocksource_read();
 526        static_call_update(pv_steal_clock, xen_steal_clock);
 527        paravirt_set_sched_clock(xen_sched_clock);
 528
 529        x86_platform.calibrate_tsc = xen_tsc_khz;
 530        x86_platform.get_wallclock = xen_get_wallclock;
 531}
 532
 533void __init xen_init_time_ops(void)
 534{
 535        xen_init_time_common();
 536
 537        x86_init.timers.timer_init = xen_time_init;
 538        x86_init.timers.setup_percpu_clockev = x86_init_noop;
 539        x86_cpuinit.setup_percpu_clockev = x86_init_noop;
 540
 541        /* Dom0 uses the native method to set the hardware RTC. */
 542        if (!xen_initial_domain())
 543                x86_platform.set_wallclock = xen_set_wallclock;
 544}
 545
 546#ifdef CONFIG_XEN_PVHVM
 547static void xen_hvm_setup_cpu_clockevents(void)
 548{
 549        int cpu = smp_processor_id();
 550        xen_setup_runstate_info(cpu);
 551        /*
 552         * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
 553         * doing it xen_hvm_cpu_notify (which gets called by smp_init during
 554         * early bootup and also during CPU hotplug events).
 555         */
 556        xen_setup_cpu_clockevents();
 557}
 558
 559void __init xen_hvm_init_time_ops(void)
 560{
 561        /*
 562         * vector callback is needed otherwise we cannot receive interrupts
 563         * on cpu > 0 and at this point we don't know how many cpus are
 564         * available.
 565         */
 566        if (!xen_have_vector_callback)
 567                return;
 568
 569        if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
 570                pr_info("Xen doesn't support pvclock on HVM, disable pv timer");
 571                return;
 572        }
 573
 574        xen_init_time_common();
 575
 576        x86_init.timers.setup_percpu_clockev = xen_time_init;
 577        x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
 578
 579        x86_platform.set_wallclock = xen_set_wallclock;
 580}
 581#endif
 582
 583/* Kernel parameter to specify Xen timer slop */
 584static int __init parse_xen_timer_slop(char *ptr)
 585{
 586        unsigned long slop = memparse(ptr, NULL);
 587
 588        xen_timerop_clockevent.min_delta_ns = slop;
 589        xen_timerop_clockevent.min_delta_ticks = slop;
 590        xen_vcpuop_clockevent.min_delta_ns = slop;
 591        xen_vcpuop_clockevent.min_delta_ticks = slop;
 592
 593        return 0;
 594}
 595early_param("xen_timer_slop", parse_xen_timer_slop);
 596