/* linux/arch/x86/mm/tlb.c */
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>

/*
 *      TLB flushing, formerly SMP-only
 *              c/o Linus Torvalds.
 *
 *      These mean you can really definitely utterly forget about
 *      writing to user space from interrupts. (It's not allowed anyway.)
 *
 *      Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *      More scalable flush, from Andi Kleen
 *
 *      Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

/*
 * We get here when we do something requiring a TLB invalidation
 * but could not go invalidate all of the contexts.  We do the
 * necessary invalidation by clearing out the 'ctx_id' which
 * forces a TLB flush when the context is loaded.
 */
void clear_asid_other(void)
{
        u16 asid;

        /*
         * This is only expected to be set if we have disabled
         * kernel _PAGE_GLOBAL pages.
         */
        if (!static_cpu_has(X86_FEATURE_PTI)) {
                WARN_ON_ONCE(1);
                return;
        }

        for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
                /* Do not need to flush the current asid */
                if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
                        continue;
                /*
                 * Make sure the next time we go to switch to
                 * this asid, we do a flush:
                 */
                this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
        }
        this_cpu_write(cpu_tlbstate.invalidate_other, false);
}

atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);

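/*
 * Pick an ASID slot on this CPU for 'next'.  If one of the dynamic slots
 * already holds next's context, reuse it and only request a flush if that
 * slot's tlb_gen is stale.  Otherwise take the next slot round-robin and
 * request a full flush for it.
 */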
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
                            u16 *new_asid, bool *need_flush)
{
        u16 asid;

        if (!static_cpu_has(X86_FEATURE_PCID)) {
                *new_asid = 0;
                *need_flush = true;
                return;
        }

        if (this_cpu_read(cpu_tlbstate.invalidate_other))
                clear_asid_other();

        for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
                if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
                    next->context.ctx_id)
                        continue;

                *new_asid = asid;
                *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
                               next_tlb_gen);
                return;
        }

        /*
         * We don't currently own an ASID slot on this CPU.
         * Allocate a slot.
         */
        *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
        if (*new_asid >= TLB_NR_DYN_ASIDS) {
                *new_asid = 0;
                this_cpu_write(cpu_tlbstate.next_asid, 1);
        }
        *need_flush = true;
}

static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
        unsigned long new_mm_cr3;

        if (need_flush) {
                invalidate_user_asid(new_asid);
                new_mm_cr3 = build_cr3(pgdir, new_asid);
        } else {
                new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
        }

        /*
         * Caution: many callers of this function expect
         * that load_cr3() is serializing and orders TLB
         * fills with respect to the mm_cpumask writes.
         */
        write_cr3(new_mm_cr3);
}

void leave_mm(int cpu)
{
        struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

        /*
         * It's plausible that we're in lazy TLB mode while our mm is init_mm.
         * If so, our callers still expect us to flush the TLB, but there
         * aren't any user TLB entries in init_mm to worry about.
         *
         * This needs to happen before any other sanity checks due to
         * intel_idle's shenanigans.
         */
        if (loaded_mm == &init_mm)
                return;

        /* Warn if we're not lazy. */
        WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));

        switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
               struct task_struct *tsk)
{
        unsigned long flags;

        local_irq_save(flags);
        switch_mm_irqs_off(prev, next, tsk);
        local_irq_restore(flags);
}

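/*
 * With CONFIG_VMAP_STACK, the current task's stack lives in vmalloc space
 * and may not yet be mapped in the next mm's page tables.  Copy the
 * top-level entry covering the stack from the kernel reference tables so
 * that touching the stack after the CR3 switch cannot double-fault.
 */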
static void sync_current_stack_to_mm(struct mm_struct *mm)
{
        unsigned long sp = current_stack_pointer;
        pgd_t *pgd = pgd_offset(mm, sp);

        if (pgtable_l5_enabled()) {
                if (unlikely(pgd_none(*pgd))) {
                        pgd_t *pgd_ref = pgd_offset_k(sp);

                        set_pgd(pgd, *pgd_ref);
                }
        } else {
                /*
                 * "pgd" is faked.  The top level entries are "p4d"s, so sync
                 * the p4d.  This compiles to approximately the same code as
                 * the 5-level case.
                 */
                p4d_t *p4d = p4d_offset(pgd, sp);

                if (unlikely(p4d_none(*p4d))) {
                        pgd_t *pgd_ref = pgd_offset_k(sp);
                        p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);

                        set_p4d(p4d, *p4d_ref);
                }
        }
}

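/*
 * Switch this CPU to 'next' with interrupts disabled: pick an ASID slot,
 * update mm_cpumask() and cpu_tlbstate, and load the new CR3.  This is
 * the common path for switch_mm() and for leaving lazy TLB mode.
 */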
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        struct task_struct *tsk)
{
        struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
        u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
        unsigned cpu = smp_processor_id();
        u64 next_tlb_gen;

        /*
         * NB: The scheduler will call us with prev == next when switching
         * from lazy TLB mode to normal mode if active_mm isn't changing.
         * When this happens, we don't assume that CR3 (and hence
         * cpu_tlbstate.loaded_mm) matches next.
         *
         * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
         */

        /* We don't want flush_tlb_func_* to run concurrently with us. */
        if (IS_ENABLED(CONFIG_PROVE_LOCKING))
                WARN_ON_ONCE(!irqs_disabled());

        /*
         * Verify that CR3 is what we think it is.  This will catch
         * hypothetical buggy code that directly switches to swapper_pg_dir
         * without going through leave_mm() / switch_mm_irqs_off() or that
         * does something like write_cr3(read_cr3_pa()).
         *
         * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
         * isn't free.
         */
#ifdef CONFIG_DEBUG_VM
        if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
                /*
                 * If we were to BUG here, we'd be very likely to kill
                 * the system so hard that we don't see the call trace.
                 * Try to recover instead by ignoring the error and doing
                 * a global flush to minimize the chance of corruption.
                 *
                 * (This is far from being a fully correct recovery.
                 *  Architecturally, the CPU could prefetch something
                 *  back into an incorrect ASID slot and leave it there
                 *  to cause trouble down the road.  It's better than
                 *  nothing, though.)
                 */
                __flush_tlb_all();
        }
#endif
        this_cpu_write(cpu_tlbstate.is_lazy, false);

        /*
         * The membarrier system call requires a full memory barrier and
         * core serialization before returning to user-space, after
         * storing to rq->curr. Writing to CR3 provides that full
         * memory barrier and core serializing instruction.
         */
        if (real_prev == next) {
                VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
                           next->context.ctx_id);

                /*
                 * We don't currently support having a real mm loaded without
                 * our cpu set in mm_cpumask().  We have all the bookkeeping
                 * in place to figure out whether we would need to flush
                 * if our cpu were cleared in mm_cpumask(), but we don't
                 * currently use it.
                 */
                if (WARN_ON_ONCE(real_prev != &init_mm &&
                                 !cpumask_test_cpu(cpu, mm_cpumask(next))))
                        cpumask_set_cpu(cpu, mm_cpumask(next));

                return;
        } else {
                u16 new_asid;
                bool need_flush;
                u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);

                /*
                 * Avoid user/user BTB poisoning by flushing the branch
                 * predictor when switching between processes. This stops
                 * one process from doing Spectre-v2 attacks on another.
                 *
                 * As an optimization, flush indirect branches only when
                 * switching into processes that disable dumping. This
                 * protects high value processes like gpg, without having
                 * too high performance overhead. IBPB is *expensive*!
                 *
                 * This will not flush branches when switching into kernel
                 * threads. It will also not flush if we switch to the idle
                 * thread and back to the same process. It will flush if we
                 * switch to a different non-dumpable process.
                 */
                if (tsk && tsk->mm &&
                    tsk->mm->context.ctx_id != last_ctx_id &&
                    get_dumpable(tsk->mm) != SUID_DUMP_USER)
                        indirect_branch_prediction_barrier();

                if (IS_ENABLED(CONFIG_VMAP_STACK)) {
                        /*
                         * If our current stack is in vmalloc space and isn't
                         * mapped in the new pgd, we'll double-fault.  Forcibly
                         * map it.
                         */
                        sync_current_stack_to_mm(next);
                }

                /* Stop remote flushes for the previous mm */
                VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
                                real_prev != &init_mm);
                cpumask_clear_cpu(cpu, mm_cpumask(real_prev));

                /*
                 * Start remote flushes and then read tlb_gen.
                 */
                cpumask_set_cpu(cpu, mm_cpumask(next));
                next_tlb_gen = atomic64_read(&next->context.tlb_gen);

                choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);

                if (need_flush) {
                        this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
                        this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
                        load_new_mm_cr3(next->pgd, new_asid, true);

                        /*
                         * NB: This gets called via leave_mm() in the idle path
                         * where RCU functions differently.  Tracing normally
                         * uses RCU, so we need to use the _rcuidle variant.
                         *
                         * (There is no good reason for this.  The idle code should
                         *  be rearranged to call this before rcu_idle_enter().)
                         */
                        trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
                } else {
                        /* The new ASID is already up to date. */
                        load_new_mm_cr3(next->pgd, new_asid, false);

                        /* See above wrt _rcuidle. */
                        trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
                }

                /*
                 * Record last user mm's context id, so we can avoid
                 * flushing branch buffer with IBPB if we switch back
                 * to the same user.
                 */
                if (next != &init_mm)
                        this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);

                this_cpu_write(cpu_tlbstate.loaded_mm, next);
                this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
        }

        load_mm_cr4(next);
        switch_ldt(real_prev, next);
}

/*
 * Please ignore the name of this function.  It should be called
 * switch_to_kernel_thread().
 *
 * enter_lazy_tlb() is a hint from the scheduler that we are entering a
 * kernel thread or other context without an mm.  Acceptable implementations
 * include doing nothing whatsoever, switching to init_mm, or various clever
 * lazy tricks to try to minimize TLB flushes.
 *
 * The scheduler reserves the right to call enter_lazy_tlb() several times
 * in a row.  It will notify us that we're going back to a real mm by
 * calling switch_mm_irqs_off().
 */
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
        if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
                return;

        if (tlb_defer_switch_to_init_mm()) {
                /*
                 * There's a significant optimization that may be possible
                 * here.  We have accurate enough TLB flush tracking that we
                 * don't need to maintain coherence of TLB per se when we're
                 * lazy.  We do, however, need to maintain coherence of
                 * paging-structure caches.  We could, in principle, leave our
                 * old mm loaded and only switch to init_mm when
                 * tlb_remove_page() happens.
                 */
                this_cpu_write(cpu_tlbstate.is_lazy, true);
        } else {
                switch_mm(NULL, &init_mm, NULL);
        }
}

/*
 * Call this when reinitializing a CPU.  It fixes the following potential
 * problems:
 *
 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
 *   because the CPU was taken down and came back up with CR3's PCID
 *   bits clear).  CPU hotplug can do this.
 *
 * - The TLB contains junk in slots corresponding to inactive ASIDs.
 *
 * - The CPU went so far out to lunch that it may have missed a TLB
 *   flush.
 */
void initialize_tlbstate_and_flush(void)
{
        int i;
        struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
        u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
        unsigned long cr3 = __read_cr3();

        /* Assert that CR3 already references the right mm. */
        WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));

        /*
         * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
         * doesn't work like other CR4 bits because it can only be set from
         * long mode.)
         */
        WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
                !(cr4_read_shadow() & X86_CR4_PCIDE));

        /* Force ASID 0 and force a TLB flush. */
        write_cr3(build_cr3(mm->pgd, 0));

        /* Reinitialize tlbstate. */
        this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
        this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
        this_cpu_write(cpu_tlbstate.next_asid, 1);
        this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
        this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);

        for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
                this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}

/*
 * flush_tlb_func_common()'s memory ordering requirement is that any
 * TLB fills that happen after we flush the TLB are ordered after we
 * read active_mm's tlb_gen.  We don't need any explicit barriers
 * because all x86 flush operations are serializing and the
 * atomic64_read operation won't be reordered by the compiler.
 */
static void flush_tlb_func_common(const struct flush_tlb_info *f,
                                  bool local, enum tlb_flush_reason reason)
{
        /*
         * We have three different tlb_gen values in here.  They are:
         *
         * - mm_tlb_gen:     the latest generation.
         * - local_tlb_gen:  the generation that this CPU has already caught
         *                   up to.
         * - f->new_tlb_gen: the generation that the requester of the flush
         *                   wants us to catch up to.
         */
        struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
        u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
        u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
        u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);

        /* This code cannot presently handle being reentered. */
        VM_WARN_ON(!irqs_disabled());

        if (unlikely(loaded_mm == &init_mm))
                return;

        VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
                   loaded_mm->context.ctx_id);

        if (this_cpu_read(cpu_tlbstate.is_lazy)) {
                /*
                 * We're in lazy mode.  We need to at least flush our
                 * paging-structure cache to avoid speculatively reading
                 * garbage into our TLB.  Since switching to init_mm is barely
                 * slower than a minimal flush, just switch to init_mm.
                 */
                switch_mm_irqs_off(NULL, &init_mm, NULL);
                return;
        }

        if (unlikely(local_tlb_gen == mm_tlb_gen)) {
                /*
                 * There's nothing to do: we're already up to date.  This can
                 * happen if two concurrent flushes happen -- the first flush to
                 * be handled can catch us all the way up, leaving no work for
                 * the second flush.
                 */
                trace_tlb_flush(reason, 0);
                return;
        }

        WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
        WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);

        /*
         * If we get to this point, we know that our TLB is out of date.
         * This does not strictly imply that we need to flush (it's
         * possible that f->new_tlb_gen <= local_tlb_gen), but we're
         * going to need to flush in the very near future, so we might
         * as well get it over with.
         *
         * The only question is whether to do a full or partial flush.
         *
         * We do a partial flush if requested and two extra conditions
         * are met:
         *
         * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
         *    we've always done all needed flushes to catch up to
         *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
         *    f->new_tlb_gen == 3, then we know that the flush needed to bring
         *    us up to date for tlb_gen 3 is the partial flush we're
         *    processing.
         *
         *    As an example of why this check is needed, suppose that there
         *    are two concurrent flushes.  The first is a full flush that
         *    changes context.tlb_gen from 1 to 2.  The second is a partial
         *    flush that changes context.tlb_gen from 2 to 3.  If they get
         *    processed on this CPU in reverse order, we'll see
         *     local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
         *    If we were to use __flush_tlb_one_user() and set local_tlb_gen to
         *    3, we'd break the invariant: we'd update local_tlb_gen above
         *    1 without the full flush that's needed for tlb_gen 2.
         *
         * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimization.
         *    Partial TLB flushes are not all that much cheaper than full TLB
         *    flushes, so it seems unlikely that it would be a performance win
         *    to do a partial flush if that won't bring our TLB fully up to
         *    date.  By doing a full flush instead, we can increase
         *    local_tlb_gen all the way to mm_tlb_gen and we can probably
         *    avoid another flush in the very near future.
         */
        if (f->end != TLB_FLUSH_ALL &&
            f->new_tlb_gen == local_tlb_gen + 1 &&
            f->new_tlb_gen == mm_tlb_gen) {
                /* Partial flush */
                unsigned long addr;
                unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;

                addr = f->start;
                while (addr < f->end) {
                        __flush_tlb_one_user(addr);
                        addr += PAGE_SIZE;
                }
                if (local)
                        count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
                trace_tlb_flush(reason, nr_pages);
        } else {
                /* Full flush. */
                local_flush_tlb();
                if (local)
                        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
                trace_tlb_flush(reason, TLB_FLUSH_ALL);
        }

        /* Both paths above update our state to mm_tlb_gen. */
        this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}

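/*
 * flush_tlb_func_local() runs on the CPU that initiated the flush;
 * flush_tlb_func_remote() is the IPI handler on the other CPUs.  The
 * remote handler bails out early if the target mm is not the one
 * currently loaded: stale entries for a non-loaded mm are caught up via
 * the tlb_gen bookkeeping the next time that mm is switched to.
 */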
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
{
        const struct flush_tlb_info *f = info;

        flush_tlb_func_common(f, true, reason);
}

static void flush_tlb_func_remote(void *info)
{
        const struct flush_tlb_info *f = info;

        inc_irq_stat(irq_tlb_count);

        if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
                return;

        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
        flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}

void native_flush_tlb_others(const struct cpumask *cpumask,
                             const struct flush_tlb_info *info)
{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
        if (info->end == TLB_FLUSH_ALL)
                trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
        else
                trace_tlb_flush(TLB_REMOTE_SEND_IPI,
                                (info->end - info->start) >> PAGE_SHIFT);

        if (is_uv_system()) {
                /*
                 * This whole special case is confused.  UV has a "Broadcast
                 * Assist Unit", which seems to be a fancy way to send IPIs.
                 * Back when x86 used an explicit TLB flush IPI, UV was
                 * optimized to use its own mechanism.  These days, x86 uses
                 * smp_call_function_many(), but UV still uses a manual IPI,
                 * and that IPI's action is out of date -- it does a manual
                 * flush instead of calling flush_tlb_func_remote().  This
                 * means that the percpu tlb_gen variables won't be updated
                 * and we'll do pointless flushes on future context switches.
                 *
                 * Rather than hooking native_flush_tlb_others() here, I think
                 * that UV should be updated so that smp_call_function_many(),
                 * etc, are optimal on UV.
                 */
                unsigned int cpu;

                cpu = smp_processor_id();
                cpumask = uv_flush_tlb_others(cpumask, info);
                if (cpumask)
                        smp_call_function_many(cpumask, flush_tlb_func_remote,
                                               (void *)info, 1);
                return;
        }
        smp_call_function_many(cpumask, flush_tlb_func_remote,
                               (void *)info, 1);
}

/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,300 ns (33 * ~100 ns).
 *
 * This is in units of pages.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;

void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                unsigned long end, unsigned long vmflag)
{
        int cpu;

        struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
                .mm = mm,
        };

        cpu = get_cpu();

        /* This is also a barrier that synchronizes with switch_mm(). */
        info.new_tlb_gen = inc_mm_tlb_gen(mm);

        /* Should we flush just the requested range? */
        if ((end != TLB_FLUSH_ALL) &&
            !(vmflag & VM_HUGETLB) &&
            ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
                info.start = start;
                info.end = end;
        } else {
                info.start = 0UL;
                info.end = TLB_FLUSH_ALL;
        }

        if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
                VM_WARN_ON(irqs_disabled());
                local_irq_disable();
                flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
                local_irq_enable();
        }

        if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
                flush_tlb_others(mm_cpumask(mm), &info);

        put_cpu();
}
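
/*
 * Illustrative sketch (not a caller in this file): after modifying user
 * PTEs in the range [start, end) of a VMA, callers typically invoke
 *
 *        flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags);
 *
 * Ranges longer than tlb_single_page_flush_ceiling pages, and hugetlb
 * ranges (VM_HUGETLB in vmflag), take the full-flush path above.
 */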

static void do_flush_tlb_all(void *info)
{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
        __flush_tlb_all();
}

void flush_tlb_all(void)
{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
        on_each_cpu(do_flush_tlb_all, NULL, 1);
}

static void do_kernel_range_flush(void *info)
{
        struct flush_tlb_info *f = info;
        unsigned long addr;

        /* Flush the range one page at a time with 'invlpg'. */
        for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
                __flush_tlb_one_kernel(addr);
}

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
        /* Use the same heuristic as user space flushes, a bit conservatively. */
        if (end == TLB_FLUSH_ALL ||
            (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
                on_each_cpu(do_flush_tlb_all, NULL, 1);
        } else {
                struct flush_tlb_info info;
                info.start = start;
                info.end = end;
                on_each_cpu(do_kernel_range_flush, &info, 1);
        }
}
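
/*
 * Illustrative sketch (not a caller in this file): code that tears down
 * kernel mappings, e.g. a vmalloc()/vunmap()-style path, is expected to
 * follow the PTE removal with
 *
 *        flush_tlb_kernel_range(start, end);
 *
 * which becomes a full flush on every CPU once the range exceeds
 * tlb_single_page_flush_ceiling pages.
 */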

void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
        struct flush_tlb_info info = {
                .mm = NULL,
                .start = 0UL,
                .end = TLB_FLUSH_ALL,
        };

        int cpu = get_cpu();

        if (cpumask_test_cpu(cpu, &batch->cpumask)) {
                VM_WARN_ON(irqs_disabled());
                local_irq_disable();
                flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
                local_irq_enable();
        }

        if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
                flush_tlb_others(&batch->cpumask, &info);

        cpumask_clear(&batch->cpumask);

        put_cpu();
}

static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
                             size_t count, loff_t *ppos)
{
        char buf[32];
        unsigned int len;

        len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
        return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
                 const char __user *user_buf, size_t count, loff_t *ppos)
{
        char buf[32];
        ssize_t len;
        int ceiling;

        len = min(count, sizeof(buf) - 1);
        if (copy_from_user(buf, user_buf, len))
                return -EFAULT;

        buf[len] = '\0';
        if (kstrtoint(buf, 0, &ceiling))
                return -EINVAL;

        if (ceiling < 0)
                return -EINVAL;

        tlb_single_page_flush_ceiling = ceiling;
        return count;
}

static const struct file_operations fops_tlbflush = {
        .read = tlbflush_read_file,
        .write = tlbflush_write_file,
        .llseek = default_llseek,
};

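/*
 * The knob below lands in debugfs, which is typically mounted at
 * /sys/kernel/debug, so (assuming the usual mount point) it can be
 * inspected and tuned with, for example:
 *
 *        # cat /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
 *        # echo 50 > /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
 */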
static int __init create_tlb_single_page_flush_ceiling(void)
{
        debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
                            arch_debugfs_dir, NULL, &fops_tlbflush);
        return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);