linux/arch/x86/xen/time.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Xen time implementation.
   4 *
   5 * This is implemented in terms of a clocksource driver which uses
   6 * the hypervisor clock as a nanosecond timebase, and a clockevent
   7 * driver which uses the hypervisor's timer mechanism.
   8 *
   9 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  10 */
  11#include <linux/kernel.h>
  12#include <linux/interrupt.h>
  13#include <linux/clocksource.h>
  14#include <linux/clockchips.h>
  15#include <linux/gfp.h>
  16#include <linux/slab.h>
  17#include <linux/pvclock_gtod.h>
  18#include <linux/timekeeper_internal.h>
  19
  20#include <asm/pvclock.h>
  21#include <asm/xen/hypervisor.h>
  22#include <asm/xen/hypercall.h>
  23
  24#include <xen/events.h>
  25#include <xen/features.h>
  26#include <xen/interface/xen.h>
  27#include <xen/interface/vcpu.h>
  28
  29#include "xen-ops.h"
  30
  31/* Minimum amount of time until next clock event fires */
  32#define TIMER_SLOP      100000
  33
  34static u64 xen_sched_clock_offset __read_mostly;
  35
  36/* Get the TSC speed from Xen */
  37static unsigned long xen_tsc_khz(void)
  38{
  39        struct pvclock_vcpu_time_info *info =
  40                &HYPERVISOR_shared_info->vcpu_info[0].time;
  41
  42        setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
  43        return pvclock_tsc_khz(info);
  44}
  45
  46static u64 xen_clocksource_read(void)
  47{
  48        struct pvclock_vcpu_time_info *src;
  49        u64 ret;
  50
  51        preempt_disable_notrace();
  52        src = &__this_cpu_read(xen_vcpu)->time;
  53        ret = pvclock_clocksource_read(src);
  54        preempt_enable_notrace();
  55        return ret;
  56}
  57
  58static u64 xen_clocksource_get_cycles(struct clocksource *cs)
  59{
  60        return xen_clocksource_read();
  61}
  62
  63static u64 xen_sched_clock(void)
  64{
  65        return xen_clocksource_read() - xen_sched_clock_offset;
  66}
  67
  68static void xen_read_wallclock(struct timespec64 *ts)
  69{
  70        struct shared_info *s = HYPERVISOR_shared_info;
  71        struct pvclock_wall_clock *wall_clock = &(s->wc);
  72        struct pvclock_vcpu_time_info *vcpu_time;
  73
  74        vcpu_time = &get_cpu_var(xen_vcpu)->time;
  75        pvclock_read_wallclock(wall_clock, vcpu_time, ts);
  76        put_cpu_var(xen_vcpu);
  77}
  78
  79static void xen_get_wallclock(struct timespec64 *now)
  80{
  81        xen_read_wallclock(now);
  82}
  83
  84static int xen_set_wallclock(const struct timespec64 *now)
  85{
  86        return -ENODEV;
  87}
  88
  89static int xen_pvclock_gtod_notify(struct notifier_block *nb,
  90                                   unsigned long was_set, void *priv)
  91{
  92        /* Protected by the calling core code serialization */
  93        static struct timespec64 next_sync;
  94
  95        struct xen_platform_op op;
  96        struct timespec64 now;
  97        struct timekeeper *tk = priv;
  98        static bool settime64_supported = true;
  99        int ret;
 100
 101        now.tv_sec = tk->xtime_sec;
 102        now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
 103
 104        /*
 105         * We only take the expensive HV call when the clock was set
 106         * or when the 11 minutes RTC synchronization time elapsed.
 107         */
 108        if (!was_set && timespec64_compare(&now, &next_sync) < 0)
 109                return NOTIFY_OK;
 110
 111again:
 112        if (settime64_supported) {
 113                op.cmd = XENPF_settime64;
 114                op.u.settime64.mbz = 0;
 115                op.u.settime64.secs = now.tv_sec;
 116                op.u.settime64.nsecs = now.tv_nsec;
 117                op.u.settime64.system_time = xen_clocksource_read();
 118        } else {
 119                op.cmd = XENPF_settime32;
 120                op.u.settime32.secs = now.tv_sec;
 121                op.u.settime32.nsecs = now.tv_nsec;
 122                op.u.settime32.system_time = xen_clocksource_read();
 123        }
 124
 125        ret = HYPERVISOR_platform_op(&op);
 126
 127        if (ret == -ENOSYS && settime64_supported) {
 128                settime64_supported = false;
 129                goto again;
 130        }
 131        if (ret < 0)
 132                return NOTIFY_BAD;
 133
 134        /*
 135         * Move the next drift compensation time 11 minutes
 136         * ahead. That's emulating the sync_cmos_clock() update for
 137         * the hardware RTC.
 138         */
 139        next_sync = now;
 140        next_sync.tv_sec += 11 * 60;
 141
 142        return NOTIFY_OK;
 143}
 144
 145static struct notifier_block xen_pvclock_gtod_notifier = {
 146        .notifier_call = xen_pvclock_gtod_notify,
 147};
 148
 149static int xen_cs_enable(struct clocksource *cs)
 150{
 151        vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
 152        return 0;
 153}
 154
 155static struct clocksource xen_clocksource __read_mostly = {
 156        .name   = "xen",
 157        .rating = 400,
 158        .read   = xen_clocksource_get_cycles,
 159        .mask   = CLOCKSOURCE_MASK(64),
 160        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
 161        .enable = xen_cs_enable,
 162};
 163
/*
   Xen clockevent implementation

   Xen has two clockevent implementations:

   The old timer_op one works with all released versions of Xen prior
   to version 3.0.4.  This version of the hypervisor provides a
   single-shot timer with nanosecond resolution.  However, sharing the
   same event channel is a 100Hz tick which is delivered while the
   vcpu is running.  We don't care about or use this tick, but it will
   cause the core time code to think the timer fired too soon, and
   will end up resetting it each time.  It could be filtered, but
   doing so has complications when the ktime clocksource is not yet
   the xen clocksource (i.e., at boot time).

   The new vcpu_op-based timer interface allows the tick timer period
   to be changed or turned off.  The tick timer is not useful as a
   periodic timer because events are only delivered to running vcpus.
   The one-shot timer can report when a timeout is in the past, so
   set_next_event is capable of returning -ETIME when appropriate.
   This interface is used when available.
*/
 186
 187
 188/*
 189  Get a hypervisor absolute time.  In theory we could maintain an
 190  offset between the kernel's time and the hypervisor's time, and
 191  apply that to a kernel's absolute timeout.  Unfortunately the
 192  hypervisor and kernel times can drift even if the kernel is using
 193  the Xen clocksource, because ntp can warp the kernel's clocksource.
 194*/
 195static s64 get_abs_timeout(unsigned long delta)
 196{
 197        return xen_clocksource_read() + delta;
 198}
 199
 200static int xen_timerop_shutdown(struct clock_event_device *evt)
 201{
 202        /* cancel timeout */
 203        HYPERVISOR_set_timer_op(0);
 204
 205        return 0;
 206}
 207
 208static int xen_timerop_set_next_event(unsigned long delta,
 209                                      struct clock_event_device *evt)
 210{
 211        WARN_ON(!clockevent_state_oneshot(evt));
 212
 213        if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
 214                BUG();
 215
 216        /* We may have missed the deadline, but there's no real way of
 217           knowing for sure.  If the event was in the past, then we'll
 218           get an immediate interrupt. */
 219
 220        return 0;
 221}
 222
 223static struct clock_event_device xen_timerop_clockevent __ro_after_init = {
 224        .name                   = "xen",
 225        .features               = CLOCK_EVT_FEAT_ONESHOT,
 226
 227        .max_delta_ns           = 0xffffffff,
 228        .max_delta_ticks        = 0xffffffff,
 229        .min_delta_ns           = TIMER_SLOP,
 230        .min_delta_ticks        = TIMER_SLOP,
 231
 232        .mult                   = 1,
 233        .shift                  = 0,
 234        .rating                 = 500,
 235
 236        .set_state_shutdown     = xen_timerop_shutdown,
 237        .set_next_event         = xen_timerop_set_next_event,
 238};
 239
 240static int xen_vcpuop_shutdown(struct clock_event_device *evt)
 241{
 242        int cpu = smp_processor_id();
 243
 244        if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu),
 245                               NULL) ||
 246            HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
 247                               NULL))
 248                BUG();
 249
 250        return 0;
 251}
 252
 253static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
 254{
 255        int cpu = smp_processor_id();
 256
 257        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
 258                               NULL))
 259                BUG();
 260
 261        return 0;
 262}
 263
 264static int xen_vcpuop_set_next_event(unsigned long delta,
 265                                     struct clock_event_device *evt)
 266{
 267        int cpu = smp_processor_id();
 268        struct vcpu_set_singleshot_timer single;
 269        int ret;
 270
 271        WARN_ON(!clockevent_state_oneshot(evt));
 272
 273        single.timeout_abs_ns = get_abs_timeout(delta);
 274        /* Get an event anyway, even if the timeout is already expired */
 275        single.flags = 0;
 276
 277        ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu),
 278                                 &single);
 279        BUG_ON(ret != 0);
 280
 281        return ret;
 282}
 283
 284static struct clock_event_device xen_vcpuop_clockevent __ro_after_init = {
 285        .name = "xen",
 286        .features = CLOCK_EVT_FEAT_ONESHOT,
 287
 288        .max_delta_ns = 0xffffffff,
 289        .max_delta_ticks = 0xffffffff,
 290        .min_delta_ns = TIMER_SLOP,
 291        .min_delta_ticks = TIMER_SLOP,
 292
 293        .mult = 1,
 294        .shift = 0,
 295        .rating = 500,
 296
 297        .set_state_shutdown = xen_vcpuop_shutdown,
 298        .set_state_oneshot = xen_vcpuop_set_oneshot,
 299        .set_next_event = xen_vcpuop_set_next_event,
 300};
 301
 302static const struct clock_event_device *xen_clockevent =
 303        &xen_timerop_clockevent;
 304
 305struct xen_clock_event_device {
 306        struct clock_event_device evt;
 307        char name[16];
 308};
 309static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };
 310
 311static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
 312{
 313        struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
 314        irqreturn_t ret;
 315
 316        ret = IRQ_NONE;
 317        if (evt->event_handler) {
 318                evt->event_handler(evt);
 319                ret = IRQ_HANDLED;
 320        }
 321
 322        return ret;
 323}
 324
 325void xen_teardown_timer(int cpu)
 326{
 327        struct clock_event_device *evt;
 328        evt = &per_cpu(xen_clock_events, cpu).evt;
 329
 330        if (evt->irq >= 0) {
 331                unbind_from_irqhandler(evt->irq, NULL);
 332                evt->irq = -1;
 333        }
 334}
 335
 336void xen_setup_timer(int cpu)
 337{
 338        struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
 339        struct clock_event_device *evt = &xevt->evt;
 340        int irq;
 341
 342        WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
 343        if (evt->irq >= 0)
 344                xen_teardown_timer(cpu);
 345
 346        printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
 347
 348        snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);
 349
 350        irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
 351                                      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
 352                                      IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
 353                                      xevt->name, NULL);
 354        (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
 355
 356        memcpy(evt, xen_clockevent, sizeof(*evt));
 357
 358        evt->cpumask = cpumask_of(cpu);
 359        evt->irq = irq;
 360}
 361
 362
 363void xen_setup_cpu_clockevents(void)
 364{
 365        clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
 366}
 367
 368void xen_timer_resume(void)
 369{
 370        int cpu;
 371
 372        if (xen_clockevent != &xen_vcpuop_clockevent)
 373                return;
 374
 375        for_each_online_cpu(cpu) {
 376                if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
 377                                       xen_vcpu_nr(cpu), NULL))
 378                        BUG();
 379        }
 380}
 381
 382static const struct pv_time_ops xen_time_ops __initconst = {
 383        .sched_clock = xen_sched_clock,
 384        .steal_clock = xen_steal_clock,
 385};
 386
 387static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
 388static u64 xen_clock_value_saved;
 389
 390void xen_save_time_memory_area(void)
 391{
 392        struct vcpu_register_time_memory_area t;
 393        int ret;
 394
 395        xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
 396
 397        if (!xen_clock)
 398                return;
 399
 400        t.addr.v = NULL;
 401
 402        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
 403        if (ret != 0)
 404                pr_notice("Cannot save secondary vcpu_time_info (err %d)",
 405                          ret);
 406        else
 407                clear_page(xen_clock);
 408}
 409
 410void xen_restore_time_memory_area(void)
 411{
 412        struct vcpu_register_time_memory_area t;
 413        int ret;
 414
 415        if (!xen_clock)
 416                goto out;
 417
 418        t.addr.v = &xen_clock->pvti;
 419
 420        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
 421
 422        /*
 423         * We don't disable VDSO_CLOCKMODE_PVCLOCK entirely if it fails to
 424         * register the secondary time info with Xen or if we migrated to a
 425         * host without the necessary flags. On both of these cases what
 426         * happens is either process seeing a zeroed out pvti or seeing no
 427         * PVCLOCK_TSC_STABLE_BIT bit set. Userspace checks the latter and
 428         * if 0, it discards the data in pvti and fallbacks to a system
 429         * call for a reliable timestamp.
 430         */
 431        if (ret != 0)
 432                pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
 433                          ret);
 434
 435out:
 436        /* Need pvclock_resume() before using xen_clocksource_read(). */
 437        pvclock_resume();
 438        xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
 439}
 440
 441static void xen_setup_vsyscall_time_info(void)
 442{
 443        struct vcpu_register_time_memory_area t;
 444        struct pvclock_vsyscall_time_info *ti;
 445        int ret;
 446
 447        ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
 448        if (!ti)
 449                return;
 450
 451        t.addr.v = &ti->pvti;
 452
 453        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
 454        if (ret) {
 455                pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (err %d)\n", ret);
 456                free_page((unsigned long)ti);
 457                return;
 458        }
 459
 460        /*
 461         * If primary time info had this bit set, secondary should too since
 462         * it's the same data on both just different memory regions. But we
 463         * still check it in case hypervisor is buggy.
 464         */
 465        if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
 466                t.addr.v = NULL;
 467                ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
 468                                         0, &t);
 469                if (!ret)
 470                        free_page((unsigned long)ti);
 471
 472                pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (tsc unstable)\n");
 473                return;
 474        }
 475
 476        xen_clock = ti;
 477        pvclock_set_pvti_cpu0_va(xen_clock);
 478
 479        xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
 480}
 481
 482static void __init xen_time_init(void)
 483{
 484        struct pvclock_vcpu_time_info *pvti;
 485        int cpu = smp_processor_id();
 486        struct timespec64 tp;
 487
 488        /* As Dom0 is never moved, no penalty on using TSC there */
 489        if (xen_initial_domain())
 490                xen_clocksource.rating = 275;
 491
 492        clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
 493
 494        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
 495                               NULL) == 0) {
 496                /* Successfully turned off 100Hz tick, so we have the
 497                   vcpuop-based timer interface */
 498                printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
 499                xen_clockevent = &xen_vcpuop_clockevent;
 500        }
 501
 502        /* Set initial system time with full resolution */
 503        xen_read_wallclock(&tp);
 504        do_settimeofday64(&tp);
 505
 506        setup_force_cpu_cap(X86_FEATURE_TSC);
 507
 508        /*
 509         * We check ahead on the primary time info if this
 510         * bit is supported hence speeding up Xen clocksource.
 511         */
 512        pvti = &__this_cpu_read(xen_vcpu)->time;
 513        if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
 514                pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 515                xen_setup_vsyscall_time_info();
 516        }
 517
 518        xen_setup_runstate_info(cpu);
 519        xen_setup_timer(cpu);
 520        xen_setup_cpu_clockevents();
 521
 522        xen_time_setup_guest();
 523
 524        if (xen_initial_domain())
 525                pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
 526}
 527
 528void __init xen_init_time_ops(void)
 529{
 530        xen_sched_clock_offset = xen_clocksource_read();
 531        pv_ops.time = xen_time_ops;
 532
 533        x86_init.timers.timer_init = xen_time_init;
 534        x86_init.timers.setup_percpu_clockev = x86_init_noop;
 535        x86_cpuinit.setup_percpu_clockev = x86_init_noop;
 536
 537        x86_platform.calibrate_tsc = xen_tsc_khz;
 538        x86_platform.get_wallclock = xen_get_wallclock;
 539        /* Dom0 uses the native method to set the hardware RTC. */
 540        if (!xen_initial_domain())
 541                x86_platform.set_wallclock = xen_set_wallclock;
 542}
 543
 544#ifdef CONFIG_XEN_PVHVM
 545static void xen_hvm_setup_cpu_clockevents(void)
 546{
 547        int cpu = smp_processor_id();
 548        xen_setup_runstate_info(cpu);
 549        /*
 550         * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
 551         * doing it xen_hvm_cpu_notify (which gets called by smp_init during
 552         * early bootup and also during CPU hotplug events).
 553         */
 554        xen_setup_cpu_clockevents();
 555}
 556
 557void __init xen_hvm_init_time_ops(void)
 558{
 559        /*
 560         * vector callback is needed otherwise we cannot receive interrupts
 561         * on cpu > 0 and at this point we don't know how many cpus are
 562         * available.
 563         */
 564        if (!xen_have_vector_callback)
 565                return;
 566
 567        if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
 568                pr_info("Xen doesn't support pvclock on HVM, disable pv timer");
 569                return;
 570        }
 571
 572        xen_sched_clock_offset = xen_clocksource_read();
 573        pv_ops.time = xen_time_ops;
 574        x86_init.timers.setup_percpu_clockev = xen_time_init;
 575        x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
 576
 577        x86_platform.calibrate_tsc = xen_tsc_khz;
 578        x86_platform.get_wallclock = xen_get_wallclock;
 579        x86_platform.set_wallclock = xen_set_wallclock;
 580}
 581#endif
 582
 583/* Kernel parameter to specify Xen timer slop */
 584static int __init parse_xen_timer_slop(char *ptr)
 585{
 586        unsigned long slop = memparse(ptr, NULL);
 587
 588        xen_timerop_clockevent.min_delta_ns = slop;
 589        xen_timerop_clockevent.min_delta_ticks = slop;
 590        xen_vcpuop_clockevent.min_delta_ns = slop;
 591        xen_vcpuop_clockevent.min_delta_ticks = slop;
 592
 593        return 0;
 594}
 595early_param("xen_timer_slop", parse_xen_timer_slop);
 596