linux/arch/x86/kernel/vsyscall_64.c
/*
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Thanks to hpa@transmeta.com for some useful hints.
 *  Special thanks to Ingo Molnar for his early experience with
 *  a different vsyscall implementation for Linux/IA32 and for the name.
 *
 *  vsyscall 0 is located at -10Mbyte, vsyscall 1 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at most 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
 *  If we want more than four we need a vDSO.
 *
 *  Note: the concept clashes with user mode linux. If you use UML and
 *  want per-guest time, just set the kernel.vsyscall64 sysctl to 0.
 */
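
/*
 * Added illustration (assuming the conventional x86-64 layout, where the
 * vsyscall page starts at 0xffffffffff600000, i.e. -10Mbyte, and each
 * slot is 1024 bytes):
 *
 *      slot 0: vgettimeofday   0xffffffffff600000
 *      slot 1: vtime           0xffffffffff600400
 *      slot 2: vgetcpu         0xffffffffff600800
 *      slot 3: venosys_1       0xffffffffff600c00
 */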

/* Disable profiling for userspace code: */
#define DISABLE_BRANCH_PROFILING

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/clocksource.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/vgtod.h>

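/*
 * Added note: code placed in the .vsyscall_<nr> sections below is linked
 * at the fixed vsyscall addresses by the linker script.  The "syscall"
 * instruction clobbers %rcx (the return RIP) and %r11 (the saved RFLAGS),
 * which is why both appear in the clobber list used by the fallback
 * syscall stubs further down.
 */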
#define __vsyscall(nr) \
                __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","cx","memory"

DEFINE_VVAR(int, vgetcpu_mode);
DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
        .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
        .sysctl_enabled = 1,
};

void update_vsyscall_tz(void)
{
        unsigned long flags;

        write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
        /* sys_tz has changed */
        vsyscall_gtod_data.sys_tz = sys_tz;
        write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}

void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
                        struct clocksource *clock, u32 mult)
{
        unsigned long flags;

        write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
        /* copy vsyscall data */
        vsyscall_gtod_data.clock.vread = clock->vread;
        vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
        vsyscall_gtod_data.clock.mask = clock->mask;
        vsyscall_gtod_data.clock.mult = mult;
        vsyscall_gtod_data.clock.shift = clock->shift;
        vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
        vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
        vsyscall_gtod_data.wall_to_monotonic = *wtm;
        vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
        write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}

/* RED-PEN: may want to re-add seq locking, but then the variable should be
 * write-once.
 */
static __always_inline void do_get_tz(struct timezone * tz)
{
        *tz = VVAR(vsyscall_gtod_data).sys_tz;
}

static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
{
        int ret;
        asm volatile("syscall"
                : "=a" (ret)
                : "0" (__NR_gettimeofday), "D" (tv), "S" (tz)
                : __syscall_clobber);
        return ret;
}

static __always_inline long time_syscall(long *t)
{
        long secs;
        asm volatile("syscall"
                : "=a" (secs)
                : "0" (__NR_time), "D" (t) : __syscall_clobber);
        return secs;
}

static __always_inline void do_vgettimeofday(struct timeval * tv)
{
        cycle_t now, base, mask, cycle_delta;
        unsigned seq;
        unsigned long mult, shift, nsec;
        cycle_t (*vread)(void);
        do {
                seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);

                vread = VVAR(vsyscall_gtod_data).clock.vread;
                if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
                             !vread)) {
                        gettimeofday(tv, NULL);
                        return;
                }

                now = vread();
                base = VVAR(vsyscall_gtod_data).clock.cycle_last;
                mask = VVAR(vsyscall_gtod_data).clock.mask;
                mult = VVAR(vsyscall_gtod_data).clock.mult;
                shift = VVAR(vsyscall_gtod_data).clock.shift;

                tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
                nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
        } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));

        /* calculate interval: */
        cycle_delta = (now - base) & mask;
        /* convert to nsecs: */
        nsec += (cycle_delta * mult) >> shift;

        while (nsec >= NSEC_PER_SEC) {
                tv->tv_sec += 1;
                nsec -= NSEC_PER_SEC;
        }
        tv->tv_usec = nsec / NSEC_PER_USEC;
}
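
/*
 * Added worked example (illustrative numbers only): for a hypothetical
 * 1 GHz clocksource the core might pick mult = 0x400000 and shift = 22,
 * since (10^9 << 22) / 10^9 = 2^22.  A delta of 1000 cycles then yields
 * (1000 * 0x400000) >> 22 = 1000 ns; i.e. (delta * mult) >> shift
 * approximates delta * NSEC_PER_SEC / freq without a division.
 */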

int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
{
        if (tv)
                do_vgettimeofday(tv);
        if (tz)
                do_get_tz(tz);
        return 0;
}
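
/*
 * Added usage sketch (user-space side, not part of this file; glibc
 * normally does this on the application's behalf).  The function is
 * reachable directly through its fixed address:
 *
 *      typedef int (*vgtod_t)(struct timeval *, struct timezone *);
 *      vgtod_t vgtod = (vgtod_t)0xffffffffff600000;
 *      struct timeval tv;
 *      vgtod(&tv, NULL);
 */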

/* This will break when the xtime seconds get inaccurate, but that is
 * unlikely */
time_t __vsyscall(1) vtime(time_t *t)
{
        unsigned seq;
        time_t result;
        if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
                return time_syscall(t);

        do {
                seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);

                result = VVAR(vsyscall_gtod_data).wall_time_sec;

        } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));

        if (t)
                *t = result;
        return result;
}

/* Fast way to get the current CPU and node.
   This helps user space implement per-node and per-CPU caches.
   The result is not guaranteed without CPU affinity, but it usually
   works out because the scheduler tries to keep a thread on the same
   CPU.

   tcache must point to an array of two unsigned longs.
   All arguments can be NULL. */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
        unsigned int p;
        unsigned long j = 0;

        /* Fast cache - only recompute the value once per jiffy and avoid
           the relatively costly rdtscp/cpuid otherwise.
           This works because the scheduler usually keeps the process
           on the same CPU and this syscall doesn't guarantee its
           results anyway.
           We do this here because otherwise user space would do it on
           its own in a likely inferior way (no access to jiffies).
           If you don't like it, pass NULL. */
        if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
                p = tcache->blob[1];
        } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
                /* Load per-CPU data from RDTSCP's IA32_TSC_AUX */
                native_read_tscp(&p);
        } else {
                /* Load per-CPU data from the segment limit in the GDT */
                asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
        }
        if (tcache) {
                tcache->blob[0] = j;
                tcache->blob[1] = p;
        }
        if (cpu)
                *cpu = p & 0xfff;
        if (node)
                *node = p >> 12;
        return 0;
}
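
/*
 * Added usage sketch (user-space side): frequent callers can pass a
 * getcpu_cache so that the rdtscp/lsl path is skipped within a jiffy:
 *
 *      struct getcpu_cache cache = {};
 *      unsigned cpu, node;
 *      vgetcpu(&cpu, &node, &cache);
 *
 * Any of the three arguments may be NULL.
 */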

static long __vsyscall(3) venosys_1(void)
{
        return -ENOSYS;
}

#ifdef CONFIG_SYSCTL
static ctl_table kernel_table2[] = {
        { .procname = "vsyscall64",
          .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
          .mode = 0644,
          .proc_handler = proc_dointvec },
        {}
};

static ctl_table kernel_root_table2[] = {
        { .procname = "kernel", .mode = 0555,
          .child = kernel_table2 },
        {}
};
#endif
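
/*
 * Added example: writing 0 to the sysctl registered above forces the
 * time vsyscalls to fall back to real syscalls, e.g. from a shell:
 *
 *      sysctl -w kernel.vsyscall64=0
 */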

/* Assume __initcall executes before all user space. Hopefully kmod
   doesn't violate that. We'll find out if it does. */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
        unsigned long d;
        unsigned long node = 0;
#ifdef CONFIG_NUMA
        node = cpu_to_node(cpu);
#endif
        if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
                write_rdtscp_aux((node << 12) | cpu);

        /* Store the cpu number in the segment limit so that it can be
           loaded quickly in user space in vgetcpu.
           12 bits for the CPU and 8 bits for the node. */
        d = 0x0f40000000000ULL;
        d |= cpu;
        d |= (node & 0xf) << 12;
        d |= (node >> 4) << 48;
        write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
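
/*
 * Added sketch of the encoding above (assuming the standard x86 segment
 * descriptor layout): the 20-bit limit is split across descriptor bits
 * 0-15 (limit low) and 48-51 (limit high), so the limit ends up as
 *
 *      limit = cpu | (node << 12)
 *
 * and vgetcpu's "lsl" recovers cpu = limit & 0xfff, node = limit >> 12.
 */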

static void __cpuinit cpu_vsyscall_init(void *arg)
{
        /* preemption should already be off */
        vsyscall_set_cpu(raw_smp_processor_id());
}

static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
        long cpu = (long)arg;
        if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
                smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
        return NOTIFY_DONE;
}

void __init map_vsyscall(void)
{
        extern char __vsyscall_0;
        unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);

        /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
        __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}

static int __init vsyscall_init(void)
{
        BUG_ON(((unsigned long) &vgettimeofday !=
                        VSYSCALL_ADDR(__NR_vgettimeofday)));
        BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
        BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
        BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
#ifdef CONFIG_SYSCTL
        register_sysctl_table(kernel_root_table2);
#endif
        on_each_cpu(cpu_vsyscall_init, NULL, 1);
        /* notifier priority > KVM */
        hotcpu_notifier(cpu_vsyscall_notifier, 30);
        return 0;
}

__initcall(vsyscall_init);