linux/arch/x86/xen/time.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Xen time implementation.
   4 *
   5 * This is implemented in terms of a clocksource driver which uses
   6 * the hypervisor clock as a nanosecond timebase, and a clockevent
   7 * driver which uses the hypervisor's timer mechanism.
   8 *
   9 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  10 */
  11#include <linux/kernel.h>
  12#include <linux/interrupt.h>
  13#include <linux/clocksource.h>
  14#include <linux/clockchips.h>
  15#include <linux/gfp.h>
  16#include <linux/slab.h>
  17#include <linux/pvclock_gtod.h>
  18#include <linux/timekeeper_internal.h>
  19
  20#include <asm/pvclock.h>
  21#include <asm/xen/hypervisor.h>
  22#include <asm/xen/hypercall.h>
  23
  24#include <xen/events.h>
  25#include <xen/features.h>
  26#include <xen/interface/xen.h>
  27#include <xen/interface/vcpu.h>
  28
  29#include "xen-ops.h"
  30
/*
 * Minimum amount of time until next clock event fires.
 * Used as the default min_delta_ns/min_delta_ticks of both Xen clockevent
 * devices; can be overridden with the "xen_timer_slop" kernel parameter.
 */
#define TIMER_SLOP      100000

/*
 * Offset subtracted from the Xen clocksource reading by xen_sched_clock().
 * Sampled when the time ops are installed and recomputed in
 * xen_restore_time_memory_area().
 */
static u64 xen_sched_clock_offset __read_mostly;
  35
  36/* Get the TSC speed from Xen */
  37static unsigned long xen_tsc_khz(void)
  38{
  39        struct pvclock_vcpu_time_info *info =
  40                &HYPERVISOR_shared_info->vcpu_info[0].time;
  41
  42        return pvclock_tsc_khz(info);
  43}
  44
  45static u64 xen_clocksource_read(void)
  46{
  47        struct pvclock_vcpu_time_info *src;
  48        u64 ret;
  49
  50        preempt_disable_notrace();
  51        src = &__this_cpu_read(xen_vcpu)->time;
  52        ret = pvclock_clocksource_read(src);
  53        preempt_enable_notrace();
  54        return ret;
  55}
  56
  57static u64 xen_clocksource_get_cycles(struct clocksource *cs)
  58{
  59        return xen_clocksource_read();
  60}
  61
  62static u64 xen_sched_clock(void)
  63{
  64        return xen_clocksource_read() - xen_sched_clock_offset;
  65}
  66
  67static void xen_read_wallclock(struct timespec64 *ts)
  68{
  69        struct shared_info *s = HYPERVISOR_shared_info;
  70        struct pvclock_wall_clock *wall_clock = &(s->wc);
  71        struct pvclock_vcpu_time_info *vcpu_time;
  72
  73        vcpu_time = &get_cpu_var(xen_vcpu)->time;
  74        pvclock_read_wallclock(wall_clock, vcpu_time, ts);
  75        put_cpu_var(xen_vcpu);
  76}
  77
  78static void xen_get_wallclock(struct timespec64 *now)
  79{
  80        xen_read_wallclock(now);
  81}
  82
  83static int xen_set_wallclock(const struct timespec64 *now)
  84{
  85        return -ENODEV;
  86}
  87
  88static int xen_pvclock_gtod_notify(struct notifier_block *nb,
  89                                   unsigned long was_set, void *priv)
  90{
  91        /* Protected by the calling core code serialization */
  92        static struct timespec64 next_sync;
  93
  94        struct xen_platform_op op;
  95        struct timespec64 now;
  96        struct timekeeper *tk = priv;
  97        static bool settime64_supported = true;
  98        int ret;
  99
 100        now.tv_sec = tk->xtime_sec;
 101        now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
 102
 103        /*
 104         * We only take the expensive HV call when the clock was set
 105         * or when the 11 minutes RTC synchronization time elapsed.
 106         */
 107        if (!was_set && timespec64_compare(&now, &next_sync) < 0)
 108                return NOTIFY_OK;
 109
 110again:
 111        if (settime64_supported) {
 112                op.cmd = XENPF_settime64;
 113                op.u.settime64.mbz = 0;
 114                op.u.settime64.secs = now.tv_sec;
 115                op.u.settime64.nsecs = now.tv_nsec;
 116                op.u.settime64.system_time = xen_clocksource_read();
 117        } else {
 118                op.cmd = XENPF_settime32;
 119                op.u.settime32.secs = now.tv_sec;
 120                op.u.settime32.nsecs = now.tv_nsec;
 121                op.u.settime32.system_time = xen_clocksource_read();
 122        }
 123
 124        ret = HYPERVISOR_platform_op(&op);
 125
 126        if (ret == -ENOSYS && settime64_supported) {
 127                settime64_supported = false;
 128                goto again;
 129        }
 130        if (ret < 0)
 131                return NOTIFY_BAD;
 132
 133        /*
 134         * Move the next drift compensation time 11 minutes
 135         * ahead. That's emulating the sync_cmos_clock() update for
 136         * the hardware RTC.
 137         */
 138        next_sync = now;
 139        next_sync.tv_sec += 11 * 60;
 140
 141        return NOTIFY_OK;
 142}
 143
/*
 * Notifier block wrapping xen_pvclock_gtod_notify(); registered from
 * xen_time_init() when running as the initial domain.
 */
static struct notifier_block xen_pvclock_gtod_notifier = {
        .notifier_call = xen_pvclock_gtod_notify,
};
 147
 148static struct clocksource xen_clocksource __read_mostly = {
 149        .name = "xen",
 150        .rating = 400,
 151        .read = xen_clocksource_get_cycles,
 152        .mask = ~0,
 153        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
 154};
 155
 156/*
 157   Xen clockevent implementation
 158
 159   Xen has two clockevent implementations:
 160
 161   The old timer_op one works with all released versions of Xen prior
 162   to version 3.0.4.  This version of the hypervisor provides a
 163   single-shot timer with nanosecond resolution.  However, sharing the
 164   same event channel is a 100Hz tick which is delivered while the
 165   vcpu is running.  We don't care about or use this tick, but it will
 166   cause the core time code to think the timer fired too soon, and
 167   will end up resetting it each time.  It could be filtered, but
 168   doing so has complications when the ktime clocksource is not yet
 169   the xen clocksource (ie, at boot time).
 170
 171   The new vcpu_op-based timer interface allows the tick timer period
 172   to be changed or turned off.  The tick timer is not useful as a
 173   periodic timer because events are only delivered to running vcpus.
 174   The one-shot timer can report when a timeout is in the past, so
 175   set_next_event is capable of returning -ETIME when appropriate.
 176   This interface is used when available.
 177*/
 178
 179
 180/*
 181  Get a hypervisor absolute time.  In theory we could maintain an
 182  offset between the kernel's time and the hypervisor's time, and
 183  apply that to a kernel's absolute timeout.  Unfortunately the
 184  hypervisor and kernel times can drift even if the kernel is using
 185  the Xen clocksource, because ntp can warp the kernel's clocksource.
 186*/
 187static s64 get_abs_timeout(unsigned long delta)
 188{
 189        return xen_clocksource_read() + delta;
 190}
 191
 192static int xen_timerop_shutdown(struct clock_event_device *evt)
 193{
 194        /* cancel timeout */
 195        HYPERVISOR_set_timer_op(0);
 196
 197        return 0;
 198}
 199
 200static int xen_timerop_set_next_event(unsigned long delta,
 201                                      struct clock_event_device *evt)
 202{
 203        WARN_ON(!clockevent_state_oneshot(evt));
 204
 205        if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
 206                BUG();
 207
 208        /* We may have missed the deadline, but there's no real way of
 209           knowing for sure.  If the event was in the past, then we'll
 210           get an immediate interrupt. */
 211
 212        return 0;
 213}
 214
 215static struct clock_event_device xen_timerop_clockevent __ro_after_init = {
 216        .name                   = "xen",
 217        .features               = CLOCK_EVT_FEAT_ONESHOT,
 218
 219        .max_delta_ns           = 0xffffffff,
 220        .max_delta_ticks        = 0xffffffff,
 221        .min_delta_ns           = TIMER_SLOP,
 222        .min_delta_ticks        = TIMER_SLOP,
 223
 224        .mult                   = 1,
 225        .shift                  = 0,
 226        .rating                 = 500,
 227
 228        .set_state_shutdown     = xen_timerop_shutdown,
 229        .set_next_event         = xen_timerop_set_next_event,
 230};
 231
 232static int xen_vcpuop_shutdown(struct clock_event_device *evt)
 233{
 234        int cpu = smp_processor_id();
 235
 236        if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu),
 237                               NULL) ||
 238            HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
 239                               NULL))
 240                BUG();
 241
 242        return 0;
 243}
 244
 245static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
 246{
 247        int cpu = smp_processor_id();
 248
 249        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
 250                               NULL))
 251                BUG();
 252
 253        return 0;
 254}
 255
 256static int xen_vcpuop_set_next_event(unsigned long delta,
 257                                     struct clock_event_device *evt)
 258{
 259        int cpu = smp_processor_id();
 260        struct vcpu_set_singleshot_timer single;
 261        int ret;
 262
 263        WARN_ON(!clockevent_state_oneshot(evt));
 264
 265        single.timeout_abs_ns = get_abs_timeout(delta);
 266        /* Get an event anyway, even if the timeout is already expired */
 267        single.flags = 0;
 268
 269        ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu),
 270                                 &single);
 271        BUG_ON(ret != 0);
 272
 273        return ret;
 274}
 275
 276static struct clock_event_device xen_vcpuop_clockevent __ro_after_init = {
 277        .name = "xen",
 278        .features = CLOCK_EVT_FEAT_ONESHOT,
 279
 280        .max_delta_ns = 0xffffffff,
 281        .max_delta_ticks = 0xffffffff,
 282        .min_delta_ns = TIMER_SLOP,
 283        .min_delta_ticks = TIMER_SLOP,
 284
 285        .mult = 1,
 286        .shift = 0,
 287        .rating = 500,
 288
 289        .set_state_shutdown = xen_vcpuop_shutdown,
 290        .set_state_oneshot = xen_vcpuop_set_oneshot,
 291        .set_next_event = xen_vcpuop_set_next_event,
 292};
 293
/*
 * Template the per-cpu clockevent devices are copied from.  Defaults to
 * the timer_op implementation; xen_time_init() switches it to the
 * vcpu_op one when that interface is available.
 */
static const struct clock_event_device *xen_clockevent =
        &xen_timerop_clockevent;

/*
 * Per-cpu clockevent device together with the storage for the name that
 * xen_setup_timer() formats ("timer<cpu>") and passes when binding the
 * timer VIRQ.
 */
struct xen_clock_event_device {
        struct clock_event_device evt;
        char name[16];
};
/* evt.irq == -1 means no timer IRQ is currently bound for that CPU. */
static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };
 302
 303static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
 304{
 305        struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
 306        irqreturn_t ret;
 307
 308        ret = IRQ_NONE;
 309        if (evt->event_handler) {
 310                evt->event_handler(evt);
 311                ret = IRQ_HANDLED;
 312        }
 313
 314        return ret;
 315}
 316
 317void xen_teardown_timer(int cpu)
 318{
 319        struct clock_event_device *evt;
 320        evt = &per_cpu(xen_clock_events, cpu).evt;
 321
 322        if (evt->irq >= 0) {
 323                unbind_from_irqhandler(evt->irq, NULL);
 324                evt->irq = -1;
 325        }
 326}
 327
 328void xen_setup_timer(int cpu)
 329{
 330        struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
 331        struct clock_event_device *evt = &xevt->evt;
 332        int irq;
 333
 334        WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
 335        if (evt->irq >= 0)
 336                xen_teardown_timer(cpu);
 337
 338        printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
 339
 340        snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);
 341
 342        irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
 343                                      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
 344                                      IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
 345                                      xevt->name, NULL);
 346        (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
 347
 348        memcpy(evt, xen_clockevent, sizeof(*evt));
 349
 350        evt->cpumask = cpumask_of(cpu);
 351        evt->irq = irq;
 352}
 353
 354
 355void xen_setup_cpu_clockevents(void)
 356{
 357        clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
 358}
 359
 360void xen_timer_resume(void)
 361{
 362        int cpu;
 363
 364        if (xen_clockevent != &xen_vcpuop_clockevent)
 365                return;
 366
 367        for_each_online_cpu(cpu) {
 368                if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
 369                                       xen_vcpu_nr(cpu), NULL))
 370                        BUG();
 371        }
 372}
 373
 374static const struct pv_time_ops xen_time_ops __initconst = {
 375        .sched_clock = xen_sched_clock,
 376        .steal_clock = xen_steal_clock,
 377};
 378
/*
 * Page holding the secondary vcpu_time_info registered with Xen by
 * xen_setup_vsyscall_time_info(); NULL when that setup did not succeed.
 */
static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
/*
 * Clocksource reading minus xen_sched_clock_offset, sampled in
 * xen_save_time_memory_area() and used on restore to recompute the offset.
 */
static u64 xen_clock_value_saved;
 381
 382void xen_save_time_memory_area(void)
 383{
 384        struct vcpu_register_time_memory_area t;
 385        int ret;
 386
 387        xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
 388
 389        if (!xen_clock)
 390                return;
 391
 392        t.addr.v = NULL;
 393
 394        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
 395        if (ret != 0)
 396                pr_notice("Cannot save secondary vcpu_time_info (err %d)",
 397                          ret);
 398        else
 399                clear_page(xen_clock);
 400}
 401
 402void xen_restore_time_memory_area(void)
 403{
 404        struct vcpu_register_time_memory_area t;
 405        int ret;
 406
 407        if (!xen_clock)
 408                goto out;
 409
 410        t.addr.v = &xen_clock->pvti;
 411
 412        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
 413
 414        /*
 415         * We don't disable VCLOCK_PVCLOCK entirely if it fails to register the
 416         * secondary time info with Xen or if we migrated to a host without the
 417         * necessary flags. On both of these cases what happens is either
 418         * process seeing a zeroed out pvti or seeing no PVCLOCK_TSC_STABLE_BIT
 419         * bit set. Userspace checks the latter and if 0, it discards the data
 420         * in pvti and fallbacks to a system call for a reliable timestamp.
 421         */
 422        if (ret != 0)
 423                pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
 424                          ret);
 425
 426out:
 427        /* Need pvclock_resume() before using xen_clocksource_read(). */
 428        pvclock_resume();
 429        xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
 430}
 431
 432static void xen_setup_vsyscall_time_info(void)
 433{
 434        struct vcpu_register_time_memory_area t;
 435        struct pvclock_vsyscall_time_info *ti;
 436        int ret;
 437
 438        ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
 439        if (!ti)
 440                return;
 441
 442        t.addr.v = &ti->pvti;
 443
 444        ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
 445        if (ret) {
 446                pr_notice("xen: VCLOCK_PVCLOCK not supported (err %d)\n", ret);
 447                free_page((unsigned long)ti);
 448                return;
 449        }
 450
 451        /*
 452         * If primary time info had this bit set, secondary should too since
 453         * it's the same data on both just different memory regions. But we
 454         * still check it in case hypervisor is buggy.
 455         */
 456        if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
 457                t.addr.v = NULL;
 458                ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
 459                                         0, &t);
 460                if (!ret)
 461                        free_page((unsigned long)ti);
 462
 463                pr_notice("xen: VCLOCK_PVCLOCK not supported (tsc unstable)\n");
 464                return;
 465        }
 466
 467        xen_clock = ti;
 468        pvclock_set_pvti_cpu0_va(xen_clock);
 469
 470        xen_clocksource.archdata.vclock_mode = VCLOCK_PVCLOCK;
 471}
 472
 473static void __init xen_time_init(void)
 474{
 475        struct pvclock_vcpu_time_info *pvti;
 476        int cpu = smp_processor_id();
 477        struct timespec64 tp;
 478
 479        /* As Dom0 is never moved, no penalty on using TSC there */
 480        if (xen_initial_domain())
 481                xen_clocksource.rating = 275;
 482
 483        clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
 484
 485        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
 486                               NULL) == 0) {
 487                /* Successfully turned off 100Hz tick, so we have the
 488                   vcpuop-based timer interface */
 489                printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
 490                xen_clockevent = &xen_vcpuop_clockevent;
 491        }
 492
 493        /* Set initial system time with full resolution */
 494        xen_read_wallclock(&tp);
 495        do_settimeofday64(&tp);
 496
 497        setup_force_cpu_cap(X86_FEATURE_TSC);
 498
 499        /*
 500         * We check ahead on the primary time info if this
 501         * bit is supported hence speeding up Xen clocksource.
 502         */
 503        pvti = &__this_cpu_read(xen_vcpu)->time;
 504        if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
 505                pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 506                xen_setup_vsyscall_time_info();
 507        }
 508
 509        xen_setup_runstate_info(cpu);
 510        xen_setup_timer(cpu);
 511        xen_setup_cpu_clockevents();
 512
 513        xen_time_setup_guest();
 514
 515        if (xen_initial_domain())
 516                pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
 517}
 518
 519void __init xen_init_time_ops(void)
 520{
 521        xen_sched_clock_offset = xen_clocksource_read();
 522        pv_ops.time = xen_time_ops;
 523
 524        x86_init.timers.timer_init = xen_time_init;
 525        x86_init.timers.setup_percpu_clockev = x86_init_noop;
 526        x86_cpuinit.setup_percpu_clockev = x86_init_noop;
 527
 528        x86_platform.calibrate_tsc = xen_tsc_khz;
 529        x86_platform.get_wallclock = xen_get_wallclock;
 530        /* Dom0 uses the native method to set the hardware RTC. */
 531        if (!xen_initial_domain())
 532                x86_platform.set_wallclock = xen_set_wallclock;
 533}
 534
 535#ifdef CONFIG_XEN_PVHVM
 536static void xen_hvm_setup_cpu_clockevents(void)
 537{
 538        int cpu = smp_processor_id();
 539        xen_setup_runstate_info(cpu);
 540        /*
 541         * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
 542         * doing it xen_hvm_cpu_notify (which gets called by smp_init during
 543         * early bootup and also during CPU hotplug events).
 544         */
 545        xen_setup_cpu_clockevents();
 546}
 547
 548void __init xen_hvm_init_time_ops(void)
 549{
 550        /*
 551         * vector callback is needed otherwise we cannot receive interrupts
 552         * on cpu > 0 and at this point we don't know how many cpus are
 553         * available.
 554         */
 555        if (!xen_have_vector_callback)
 556                return;
 557
 558        if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
 559                pr_info("Xen doesn't support pvclock on HVM, disable pv timer");
 560                return;
 561        }
 562
 563        xen_sched_clock_offset = xen_clocksource_read();
 564        pv_ops.time = xen_time_ops;
 565        x86_init.timers.setup_percpu_clockev = xen_time_init;
 566        x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
 567
 568        x86_platform.calibrate_tsc = xen_tsc_khz;
 569        x86_platform.get_wallclock = xen_get_wallclock;
 570        x86_platform.set_wallclock = xen_set_wallclock;
 571}
 572#endif
 573
 574/* Kernel parameter to specify Xen timer slop */
 575static int __init parse_xen_timer_slop(char *ptr)
 576{
 577        unsigned long slop = memparse(ptr, NULL);
 578
 579        xen_timerop_clockevent.min_delta_ns = slop;
 580        xen_timerop_clockevent.min_delta_ticks = slop;
 581        xen_vcpuop_clockevent.min_delta_ns = slop;
 582        xen_vcpuop_clockevent.min_delta_ticks = slop;
 583
 584        return 0;
 585}
 586early_param("xen_timer_slop", parse_xen_timer_slop);
 587