linux/drivers/misc/sgi-gru/grutlbpurge.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * SN Platform GRU Driver
 *
 *              MMUOPS callbacks + TLB flushing
 *
 * This file handles mmu notifier callbacks from the core kernel. The callbacks
 * are used to update the TLB in the GRU as a result of changes in the
 * state of a process address space. This file also handles TLB invalidates
 * from the GRU driver.
 *
 *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
 */

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/device.h>
#include <linux/hugetlb.h>
#include <linux/delay.h>
#include <linux/timex.h>
#include <linux/srcu.h>
#include <asm/processor.h>
#include "gru.h"
#include "grutables.h"
#include <asm/uv/uv_hub.h>

#define gru_random()    get_cycles()

/* ---------------------------------- TLB Invalidation functions --------
 * get_tgh_handle
 *
 * Find a TGH to use for issuing a TLB invalidate. For GRUs that are on the
 * local blade, use a fixed TGH that is a function of the blade-local cpu
 * number. Normally, this TGH is private to the cpu & no contention occurs for
 * the TGH. For offblade GRUs, select a random TGH in the range above the
 * private TGHs. A spinlock is required to access this TGH & the lock must be
 * released when the invalidate completes. This sucks, but it is the best we
 * can do.
 *
 * Note that the spinlock is IN the TGH handle so locking does not involve
 * additional cache lines.
 *
 */
static inline int get_off_blade_tgh(struct gru_state *gru)
{
        int n;

        n = GRU_NUM_TGH - gru->gs_tgh_first_remote;
        n = gru_random() % n;
        n += gru->gs_tgh_first_remote;
        return n;
}

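/*
 * On the local blade, each cpu (or group of cpus when the blade has more
 * than MAX_LOCAL_TGH cpus) maps to its own low-numbered TGH. The shift is
 * computed in gru_tgh_flush_init().
 */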
static inline int get_on_blade_tgh(struct gru_state *gru)
{
        return uv_blade_processor_id() >> gru->gs_tgh_local_shift;
}

static struct gru_tlb_global_handle *get_lock_tgh_handle(struct gru_state
                                                         *gru)
{
        struct gru_tlb_global_handle *tgh;
        int n;

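        /*
         * Stay on this cpu: the TGH chosen below depends on the current
         * blade/cpu. Preemption is re-enabled in get_unlock_tgh_handle().
         */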
        preempt_disable();
        if (uv_numa_blade_id() == gru->gs_blade_id)
                n = get_on_blade_tgh(gru);
        else
                n = get_off_blade_tgh(gru);
        tgh = get_tgh_by_index(gru, n);
        lock_tgh_handle(tgh);

        return tgh;
}

static void get_unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
{
        unlock_tgh_handle(tgh);
        preempt_enable();
}

/*
 * gru_flush_tlb_range
 *
 * General purpose TLB invalidation function. This function scans every GRU in
 * the ENTIRE system (partition) looking for GRUs where the specified MM has
 * been accessed by the GRU. For each GRU found, the TLB must be invalidated OR
 * the ASID invalidated. Invalidating an ASID causes a new ASID to be assigned
 * on the next fault. This effectively flushes the ENTIRE TLB for the MM at the
 * cost of (possibly) a large number of future TLB misses.
 *
 * The current algorithm is optimized based on the following (somewhat true)
 * assumptions:
 *      - GRU contexts are not loaded into a GRU unless a reference is made to
 *        the data segment or control block (this is true, not an assumption).
 *        If a DS/CB is referenced, the user will also issue instructions that
 *        cause TLB misses. It is not necessary to optimize for the case where
 *        contexts are loaded but no instructions cause TLB misses. (I know
 *        this will happen but I'm not optimizing for it).
 *      - GRU instructions to invalidate TLB entries are SLOOOOWWW - normally
 *        a few usec but in unusual cases, it could be longer. Avoid if
 *        possible.
 *      - intrablade process migration between cpus is not frequent but is
 *        common.
 *      - a GRU context is not typically migrated to a different GRU on the
 *        blade because of intrablade migration.
 *      - interblade migration is rare. Processes migrate their GRU context to
 *        the new blade.
 *      - if interblade migration occurs, migration back to the original blade
 *        is very very rare (i.e., no optimization for this case)
 *      - most GRU instructions operate on a subset of the user REGIONS. Code
 *        & shared library regions are not likely targets of GRU instructions.
 *
 * To help improve the efficiency of TLB invalidation, the GMS data
 * structure is maintained for EACH address space (MM struct). The GMS is
 * also the structure that contains the pointer to the mmu callout
 * functions. This structure is linked to the mm_struct for the address space
 * using the mmu "register" function. The mmu interfaces are used to
 * provide the callbacks for TLB invalidation. The GMS contains:
 *
 *      - asid[maxgrus] array. ASIDs are assigned to a GRU when a context is
 *        loaded into the GRU.
 *      - asidmap[maxgrus]. bitmap to make it easier to find non-zero asids in
 *        the above array
 *      - ctxbitmap[maxgrus]. Indicates the contexts that are currently active
 *        in the GRU for the address space. This bitmap must be passed to the
 *        GRU to do an invalidate.
 *
 * The current algorithm for invalidating TLBs is:
 *      - scan the asidmap for GRUs where the context has been loaded, i.e.,
 *        asid is non-zero.
 *      - for each gru found:
 *              - if the ctxbitmap is non-zero, there are active contexts in the
 *                GRU. TLB invalidate instructions must be issued to the GRU.
 *              - if the ctxbitmap is zero, no context is active. Set the ASID to
 *                zero to force a full TLB invalidation. This is fast but will
 *                cause a lot of TLB misses if the context is reloaded onto the
 *                GRU.
 *
 */

void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
                         unsigned long len)
{
        struct gru_state *gru;
        struct gru_mm_tracker *asids;
        struct gru_tlb_global_handle *tgh;
        unsigned long num;
        int grupagesize, pagesize, pageshift, gid, asid;

        /* ZZZ TODO - handle huge pages */
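        /* base pages only for now: page count for the range, capped at GRUMAXINVAL */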
        pageshift = PAGE_SHIFT;
        pagesize = (1UL << pageshift);
        grupagesize = GRU_PAGESIZE(pageshift);
        num = min(((len + pagesize - 1) >> pageshift), GRUMAXINVAL);

        STAT(flush_tlb);
        gru_dbg(grudev, "gms %p, start 0x%lx, len 0x%lx, asidmap 0x%lx\n", gms,
                start, len, gms->ms_asidmap[0]);

        spin_lock(&gms->ms_asid_lock);
        for_each_gru_in_bitmap(gid, gms->ms_asidmap) {
                STAT(flush_tlb_gru);
                gru = GID_TO_GRU(gid);
                asids = gms->ms_asids + gid;
                asid = asids->mt_asid;
                if (asids->mt_ctxbitmap && asid) {
                        STAT(flush_tlb_gru_tgh);
                        asid = GRUASID(asid, start);
                        gru_dbg(grudev,
        "  FLUSH gruid %d, asid 0x%x, vaddr 0x%lx, vamask 0x%x, num %ld, cbmap 0x%x\n",
                              gid, asid, start, grupagesize, num, asids->mt_ctxbitmap);
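                        /*
                         * Invalidate the range in every context on this GRU
                         * that belongs to the mm (mt_ctxbitmap).
                         */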
                        tgh = get_lock_tgh_handle(gru);
                        tgh_invalidate(tgh, start, ~0, asid, grupagesize, 0,
                                       num - 1, asids->mt_ctxbitmap);
                        get_unlock_tgh_handle(tgh);
                } else {
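                        /*
                         * No active contexts: invalidate the ASID instead.
                         * A new ASID is assigned on the next GRU fault.
                         */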
                        STAT(flush_tlb_gru_zero_asid);
                        asids->mt_asid = 0;
                        __clear_bit(gru->gs_gid, gms->ms_asidmap);
                        gru_dbg(grudev,
        "  CLEARASID gruid %d, asid 0x%x, cbtmap 0x%x, asidmap 0x%lx\n",
                                gid, asid, asids->mt_ctxbitmap,
                                gms->ms_asidmap[0]);
                }
        }
        spin_unlock(&gms->ms_asid_lock);
}

/*
 * Flush the entire TLB on a chiplet.
 */
void gru_flush_all_tlb(struct gru_state *gru)
{
        struct gru_tlb_global_handle *tgh;

        gru_dbg(grudev, "gid %d\n", gru->gs_gid);
        tgh = get_lock_tgh_handle(gru);
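        /* global invalidate: all addresses, all contexts (ctxbitmap 0xffff) */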
        tgh_invalidate(tgh, 0, ~0, 0, 1, 1, GRUMAXINVAL - 1, 0xffff);
        get_unlock_tgh_handle(tgh);
}

/*
 * MMUOPS notifier callout functions
 */
static int gru_invalidate_range_start(struct mmu_notifier *mn,
                        const struct mmu_notifier_range *range)
{
        struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
                                                 ms_notifier);

        STAT(mmu_invalidate_range);
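        /*
         * ms_range_active is non-zero while an invalidate is in progress;
         * GRU TLB dropins for this mm are held off until the matching
         * gru_invalidate_range_end() wakes ms_wait_queue.
         */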
        atomic_inc(&gms->ms_range_active);
        gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
                range->start, range->end, atomic_read(&gms->ms_range_active));
        gru_flush_tlb_range(gms, range->start, range->end - range->start);

        return 0;
}

static void gru_invalidate_range_end(struct mmu_notifier *mn,
                        const struct mmu_notifier_range *range)
{
        struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
                                                 ms_notifier);

        /* ..._and_test() provides needed barrier */
        (void)atomic_dec_and_test(&gms->ms_range_active);

        wake_up_all(&gms->ms_wait_queue);
        gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx\n",
                gms, range->start, range->end);
}

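/*
 * ->alloc_notifier() and ->free_notifier() are invoked by the mmu notifier
 * core on behalf of mmu_notifier_get_locked() and mmu_notifier_put().
 */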
static struct mmu_notifier *gru_alloc_notifier(struct mm_struct *mm)
{
        struct gru_mm_struct *gms;

        gms = kzalloc(sizeof(*gms), GFP_KERNEL);
        if (!gms)
                return ERR_PTR(-ENOMEM);
        STAT(gms_alloc);
        spin_lock_init(&gms->ms_asid_lock);
        init_waitqueue_head(&gms->ms_wait_queue);

        return &gms->ms_notifier;
}

static void gru_free_notifier(struct mmu_notifier *mn)
{
        kfree(container_of(mn, struct gru_mm_struct, ms_notifier));
        STAT(gms_free);
}

static const struct mmu_notifier_ops gru_mmuops = {
        .invalidate_range_start = gru_invalidate_range_start,
        .invalidate_range_end   = gru_invalidate_range_end,
        .alloc_notifier         = gru_alloc_notifier,
        .free_notifier          = gru_free_notifier,
};

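/*
 * Return the GMS for current->mm, allocating one via gru_alloc_notifier() if
 * none is registered yet. mmu_notifier_get_locked() expects the caller to
 * hold mmap_lock for write.
 */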
struct gru_mm_struct *gru_register_mmu_notifier(void)
{
        struct mmu_notifier *mn;

        mn = mmu_notifier_get_locked(&gru_mmuops, current->mm);
        if (IS_ERR(mn))
                return ERR_CAST(mn);

        return container_of(mn, struct gru_mm_struct, ms_notifier);
}

void gru_drop_mmu_notifier(struct gru_mm_struct *gms)
{
        mmu_notifier_put(&gms->ms_notifier);
}

/*
 * Setup TGH parameters. There are:
 *      - 24 TGH handles per GRU chiplet
 *      - a portion (MAX_LOCAL_TGH) of the handles is reserved for
 *        use by blade-local cpus
 *      - the rest are used by off-blade cpus. This usage is
 *        less frequent than blade-local usage.
 *
 * For now, use 16 handles for local flushes, 8 for remote flushes. If the blade
 * has 16 or fewer cpus, each cpu has a unique handle that it can use.
 */
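/*
 * Example: a blade with 32 possible cpus gives n = 32, shift = 1, so pairs
 * of blade-local cpus share one of the 16 local handles and
 * gs_tgh_first_remote = 16, leaving handles 16..23 for off-blade purges.
 */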
#define MAX_LOCAL_TGH   16

void gru_tgh_flush_init(struct gru_state *gru)
{
        int cpus, shift = 0, n;

        cpus = uv_blade_nr_possible_cpus(gru->gs_blade_id);

        /* n = cpus rounded up to next power of 2 */
        if (cpus) {
                n = 1 << fls(cpus - 1);

                /*
                 * shift count for converting local cpu# to TGH index
                 *      0 if cpus <= MAX_LOCAL_TGH,
                 *      1 if cpus <= 2*MAX_LOCAL_TGH,
                 *      etc
                 */
                shift = max(0, fls(n - 1) - fls(MAX_LOCAL_TGH - 1));
        }
        gru->gs_tgh_local_shift = shift;

        /* first starting TGH index to use for remote purges */
        gru->gs_tgh_first_remote = (cpus + (1 << shift) - 1) >> shift;
}