linux/include/linux/hmm.h
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Heterogeneous Memory Management (HMM)
 *
 * See Documentation/vm/hmm.rst for reasons and overview of what HMM is and
 * what it is for. Here we focus on the HMM API description, with some
 * explanation of the underlying implementation.
 *
 * Short description: HMM provides a set of helpers to share a virtual address
 * space between CPU and a device, so that the device can access any valid
 * address of the process (while still obeying memory protection). HMM also
 * provides helpers to migrate process memory to device memory, and back. Each
 * set of functionality (address space mirroring, and migration to and from
 * device memory) can be used independently of the other.
 *
 *
 * HMM address space mirroring API:
 *
 * Use HMM address space mirroring if you want to mirror a range of the CPU
 * page tables of a process into a device page table. Here, "mirror" means "keep
 * synchronized". Prerequisites: the device must provide the ability to write-
 * protect its page tables (at PAGE_SIZE granularity), and must be able to
 * recover from the resulting potential page faults.
 *
 * HMM guarantees that at any point in time, a given virtual address points to
 * either the same memory in both CPU and device page tables (that is: CPU and
 * device page tables each point to the same pages), or that one page table (CPU
 * or device) points to no entry, while the other still points to the old page
 * for the address. The latter case happens when the CPU page table update
 * happens first, and then the update is mirrored over to the device page table.
 * This does not cause any issue, because the CPU page table cannot start
 * pointing to a new page until the device page table is invalidated.
 *
 * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
 * updates to each device driver that has registered a mirror. It also provides
 * some API calls to help with taking a snapshot of the CPU page table, and to
 * synchronize with any updates that might happen concurrently.
 *
 *
 * HMM migration to and from device memory:
 *
 * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
 * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
 * of the device memory, and allows the device driver to manage its memory
 * using those struct pages. Having struct pages for device memory makes
 * migration easier. Because that memory is not addressable by the CPU it must
 * never be pinned to the device; in other words, any CPU page fault can always
 * cause the device memory to be migrated (copied/moved) back to regular memory.
 *
 * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
 * allows use of a device DMA engine to perform the copy operation between
 * regular system memory and device memory.
 */
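
/*
 * Hedged sketch only (my_migrate_ops and my_private are hypothetical,
 * driver-specific names; include/linux/migrate.h and mm/migrate.c are the
 * authoritative reference for the interface): a driver migrating a virtual
 * address range to its own memory would typically call migrate_vma() with
 * source and destination pfn arrays and driver callbacks that allocate device
 * pages and drive the device DMA engine for the copy:
 *
 *      src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
 *      dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
 *      ret = migrate_vma(&my_migrate_ops, vma, start, end, src, dst,
 *                        my_private);
 */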
#ifndef LINUX_HMM_H
#define LINUX_HMM_H

#include <linux/kconfig.h>
#include <asm/pgtable.h>

#ifdef CONFIG_HMM_MIRROR

#include <linux/device.h>
#include <linux/migrate.h>
#include <linux/memremap.h>
#include <linux/completion.h>
#include <linux/mmu_notifier.h>


/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @kref: reference count for this HMM struct
 * @ranges_lock: lock protecting the ranges list
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to the CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 * @wq: wait queue for users waiting on a range invalidation
 * @rcu: RCU head used to defer freeing of this struct
 * @notifiers: count of active mmu notifiers
 */
struct hmm {
        struct mm_struct        *mm;
        struct kref             kref;
        spinlock_t              ranges_lock;
        struct list_head        ranges;
        struct list_head        mirrors;
        struct mmu_notifier     mmu_notifier;
        struct rw_semaphore     mirrors_sem;
        wait_queue_head_t       wq;
        struct rcu_head         rcu;
        long                    notifiers;
};

/*
 * hmm_pfn_flag_e - HMM flag enums
 *
 * Flags:
 * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
 * HMM_PFN_WRITE: CPU page table has write permission set
 * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE)
 *
 * The driver provides a flags array for mapping page protections to device
 * PTE bits. If the driver valid bit for an entry is bit 3,
 * i.e., (entry & (1 << 3)), then the driver must provide
 * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3.
 * The same logic applies to all flags. This is the same idea as vm_page_prot
 * in the vma, except that this is per device driver rather than per
 * architecture.
 */
enum hmm_pfn_flag_e {
        HMM_PFN_VALID = 0,
        HMM_PFN_WRITE,
        HMM_PFN_DEVICE_PRIVATE,
        HMM_PFN_FLAG_MAX
};
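
/*
 * Illustration only (the my_* name is hypothetical, not taken from any real
 * driver): a device whose page table entries use bit 3 as the valid bit,
 * bit 4 as the write bit and bit 5 to mark device private memory could
 * provide:
 *
 *      static const uint64_t my_hmm_flags[HMM_PFN_FLAG_MAX] = {
 *              [HMM_PFN_VALID]          = 1UL << 3,
 *              [HMM_PFN_WRITE]          = 1UL << 4,
 *              [HMM_PFN_DEVICE_PRIVATE] = 1UL << 5,
 *      };
 *
 * and point hmm_range.flags at it before using the range API.
 */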

/*
 * hmm_pfn_value_e - HMM pfn special value
 *
 * Flags:
 * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
 * HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
 * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
 *      result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not
 *      be mirrored by a device, because the entry will never have HMM_PFN_VALID
 *      set and the pfn value is undefined.
 *
 * The driver provides values for the none entry, error entry, and special
 * entry. The driver can alias (i.e., use the same value for) error and
 * special, but it must not alias none with error or special.
 *
 * The HMM pfn value returned by hmm_range_snapshot() or hmm_range_fault() will
 * be:
 * hmm_range.values[HMM_PFN_ERROR] if the CPU page table entry is poisoned,
 * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry,
 * hmm_range.values[HMM_PFN_SPECIAL] if the CPU page table entry is a special
 * one.
 */
enum hmm_pfn_value_e {
        HMM_PFN_ERROR,
        HMM_PFN_NONE,
        HMM_PFN_SPECIAL,
        HMM_PFN_VALUE_MAX
};
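
/*
 * Illustration only (hypothetical my_* name): a driver could pick encodings
 * that never have its valid bit set, for example with the flags array sketched
 * above:
 *
 *      static const uint64_t my_hmm_values[HMM_PFN_VALUE_MAX] = {
 *              [HMM_PFN_ERROR]   = 0x1,
 *              [HMM_PFN_NONE]    = 0x0,
 *              [HMM_PFN_SPECIAL] = 0x2,
 *      };
 *
 * None must not alias error or special, as noted above.
 */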

/*
 * struct hmm_range - track invalidation lock on virtual address range
 *
 * @hmm: the core HMM structure this range is active against
 * @vma: the vm area struct for the range
 * @list: all range locks are on a list
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @pfns: array of pfns (big enough for the range)
 * @flags: pfn flags to match device driver page table
 * @values: pfn values for the special cases (none, special, error, ...)
 * @default_flags: default flags for the range (write, read, ... see hmm doc)
 * @pfn_flags_mask: allows masking pfn flags so that only default_flags matter
 * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT)
 * @pfn_shift: pfn shift value (should be <= PAGE_SHIFT)
 * @valid: pfns array did not change since it was filled by an HMM function
 */
struct hmm_range {
        struct hmm              *hmm;
        struct vm_area_struct   *vma;
        struct list_head        list;
        unsigned long           start;
        unsigned long           end;
        uint64_t                *pfns;
        const uint64_t          *flags;
        const uint64_t          *values;
        uint64_t                default_flags;
        uint64_t                pfn_flags_mask;
        uint8_t                 page_shift;
        uint8_t                 pfn_shift;
        bool                    valid;
};
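
/*
 * Hedged usage note (a sketch, see Documentation/vm/hmm.rst for the
 * authoritative description): default_flags and pfn_flags_mask let a driver
 * request the same permission for every page of the range. For example, to
 * require that every page be at least readable, while ignoring any per-pfn
 * input flags, a driver could set:
 *
 *      range.default_flags = range.flags[HMM_PFN_VALID];
 *      range.pfn_flags_mask = 0;
 *
 * Adding range.flags[HMM_PFN_WRITE] to default_flags would additionally
 * require write permission for every page of the range.
 */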

/*
 * hmm_range_page_shift() - return the page shift for the range
 * @range: range being queried
 * Return: page shift (page size = 1 << page shift) for the range
 */
static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
{
        return range->page_shift;
}

/*
 * hmm_range_page_size() - return the page size for the range
 * @range: range being queried
 * Return: page size for the range in bytes
 */
static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
{
        return 1UL << hmm_range_page_shift(range);
}

/*
 * hmm_range_wait_until_valid() - wait for the range to be valid
 * @range: range affected by an invalidation to wait on
 * @timeout: timeout for the wait, in ms (i.e., abort the wait after that
 *      period of time)
 * Return: true if the range is valid, false otherwise.
 */
static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
                                              unsigned long timeout)
{
        return wait_event_timeout(range->hmm->wq, range->valid,
                                  msecs_to_jiffies(timeout)) != 0;
}

/*
 * hmm_range_valid() - test if a range is valid or not
 * @range: range to test
 * Return: true if the range is valid, false otherwise.
 */
static inline bool hmm_range_valid(struct hmm_range *range)
{
        return range->valid;
}

/*
 * hmm_device_entry_to_page() - return the struct page pointed to by a device entry
 * @range: range used to decode the device entry value
 * @entry: device entry value to get the corresponding struct page from
 * Return: struct page pointer if the entry is valid, NULL otherwise
 *
 * If the device entry is valid (i.e., has the valid flag set) then return the
 * struct page matching the entry value. Otherwise return NULL.
 */
static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range,
                                                    uint64_t entry)
{
        if (entry == range->values[HMM_PFN_NONE])
                return NULL;
        if (entry == range->values[HMM_PFN_ERROR])
                return NULL;
        if (entry == range->values[HMM_PFN_SPECIAL])
                return NULL;
        if (!(entry & range->flags[HMM_PFN_VALID]))
                return NULL;
        return pfn_to_page(entry >> range->pfn_shift);
}

/*
 * hmm_device_entry_to_pfn() - return the pfn value stored in a device entry
 * @range: range used to decode the device entry value
 * @pfn: device entry to extract the pfn from
 * Return: pfn value if the device entry is valid, -1UL otherwise
 */
static inline unsigned long
hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
{
        if (pfn == range->values[HMM_PFN_NONE])
                return -1UL;
        if (pfn == range->values[HMM_PFN_ERROR])
                return -1UL;
        if (pfn == range->values[HMM_PFN_SPECIAL])
                return -1UL;
        if (!(pfn & range->flags[HMM_PFN_VALID]))
                return -1UL;
        return (pfn >> range->pfn_shift);
}

/*
 * hmm_device_entry_from_page() - create a valid device entry for a page
 * @range: range used to encode the HMM pfn value
 * @page: page for which to create the device entry
 * Return: valid device entry for the page
 */
static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
                                                  struct page *page)
{
        return (page_to_pfn(page) << range->pfn_shift) |
                range->flags[HMM_PFN_VALID];
}

/*
 * hmm_device_entry_from_pfn() - create a valid device entry value from a pfn
 * @range: range used to encode the HMM pfn value
 * @pfn: pfn value for which to create the device entry
 * Return: valid device entry for the pfn
 */
static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
                                                 unsigned long pfn)
{
        return (pfn << range->pfn_shift) |
                range->flags[HMM_PFN_VALID];
}

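/*
 * Illustrative sketch only (driver code, not part of this API): after a
 * successful hmm_range_snapshot() or hmm_range_fault(), a driver typically
 * walks the pfns array and converts each entry back to a struct page:
 *
 *      unsigned long i, npages;
 *
 *      npages = (range->end - range->start) >> range->page_shift;
 *      for (i = 0; i < npages; i++) {
 *              struct page *page;
 *
 *              page = hmm_device_entry_to_page(range, range->pfns[i]);
 *              if (!page)
 *                      continue; // none, error or special entry
 *              // program the device page table entry for this page
 *      }
 */
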
/*
 * Old API:
 * hmm_pfn_to_page()
 * hmm_pfn_to_pfn()
 * hmm_pfn_from_page()
 * hmm_pfn_from_pfn()
 *
 * These are the OLD API; please use the new API. They are kept here to avoid
 * cross-tree merge pain, i.e., we convert things to the new API in stages.
 */
static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
                                           uint64_t pfn)
{
        return hmm_device_entry_to_page(range, pfn);
}

static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
                                           uint64_t pfn)
{
        return hmm_device_entry_to_pfn(range, pfn);
}

static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
                                         struct page *page)
{
        return hmm_device_entry_from_page(range, page);
}

static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
                                        unsigned long pfn)
{
        return hmm_device_entry_from_pfn(range, pfn);
}

/*
 * Mirroring: how to synchronize device page table with CPU page table.
 *
 * A device driver that is participating in HMM mirroring must always
 * synchronize with CPU page table updates. For this, device drivers can either
 * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device
 * drivers can decide to register one mirror per device per process, or just
 * one mirror per process for a group of devices. The pattern is:
 *
 *      int device_bind_address_space(..., struct mm_struct *mm, ...)
 *      {
 *          struct device_address_space *das;
 *
 *          // Device driver specific initialization, and allocation of das
 *          // which contains an hmm_mirror struct as one of its fields.
 *          ...
 *
 *          das->mirror.ops = &device_mirror_ops;
 *          ret = hmm_mirror_register(&das->mirror, mm);
 *          if (ret) {
 *              // Cleanup on error
 *              return ret;
 *          }
 *
 *          // Other device driver specific initialization
 *          ...
 *      }
 *
 * Once an hmm_mirror is registered for an address space, the device driver
 * will get callbacks through the sync_cpu_device_pagetables() operation (see
 * the hmm_mirror_ops struct).
 *
 * The device driver must not free the struct containing the hmm_mirror struct
 * before calling hmm_mirror_unregister(). The expected usage is to do that when
 * the device driver is unbinding from an address space.
 *
 *
 *      void device_unbind_address_space(struct device_address_space *das)
 *      {
 *          // Device driver specific cleanup
 *          ...
 *
 *          hmm_mirror_unregister(&das->mirror);
 *
 *          // Other device driver specific cleanup, and now das can be freed
 *          ...
 *      }
 */

struct hmm_mirror;

/*
 * enum hmm_update_event - type of update
 * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
 */
enum hmm_update_event {
        HMM_UPDATE_INVALIDATE,
};

/*
 * struct hmm_update - HMM update information for callback
 *
 * @start: virtual start address of the range to update
 * @end: virtual end address of the range to update
 * @event: event triggering the update (what is happening)
 * @blockable: can the callback block/sleep?
 */
struct hmm_update {
        unsigned long start;
        unsigned long end;
        enum hmm_update_event event;
        bool blockable;
};

/*
 * struct hmm_mirror_ops - HMM mirror device operations callbacks
 *
 * @release: callback invoked when the mm this mirror tracks is being released
 * @sync_cpu_device_pagetables: callback to update a range on the device
 */
struct hmm_mirror_ops {
        /* release() - release hmm_mirror
         *
         * @mirror: pointer to struct hmm_mirror
         *
         * This is called when the mm_struct is being released.  The callback
         * must ensure that all access to any pages obtained from this mirror
         * is halted before the callback returns. All future access should
         * fault.
         */
        void (*release)(struct hmm_mirror *mirror);

        /* sync_cpu_device_pagetables() - synchronize page tables
         *
         * @mirror: pointer to struct hmm_mirror
         * @update: update information (see struct hmm_update)
         * Return: -EAGAIN if update.blockable is false and the callback needs
         *          to block, 0 otherwise.
         *
         * This callback ultimately originates from mmu_notifiers when the CPU
         * page table is updated. The device driver must update its page table
         * in response to this callback. The update argument tells what action
         * to perform.
         *
         * The device driver must not return from this callback until the device
         * page tables are completely updated (TLBs flushed, etc); this is a
         * synchronous call.
         */
        int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
                                          const struct hmm_update *update);
};
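
/*
 * Hedged sketch only (the my_* names are hypothetical, not from a real
 * driver): a minimal sync_cpu_device_pagetables() implementation that
 * invalidates the device page table for the updated range could look like:
 *
 *      static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *                                               const struct hmm_update *update)
 *      {
 *          struct my_device *mydev;
 *
 *          mydev = container_of(mirror, struct my_device, mirror);
 *          if (!update->blockable && my_invalidation_might_sleep(mydev))
 *              return -EAGAIN;
 *
 *          my_invalidate_device_range(mydev, update->start, update->end);
 *          return 0;
 *      }
 *
 * The device page tables and TLBs for [start, end) must be fully invalidated
 * before returning 0, as described above.
 */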

/*
 * struct hmm_mirror - mirror struct for a device driver
 *
 * @hmm: pointer to struct hmm (which is unique per mm_struct)
 * @ops: device driver callback for HMM mirror operations
 * @list: for list of mirrors of a given mm
 *
 * Each address space (mm_struct) being mirrored by a device must register one
 * instance of an hmm_mirror struct with HMM. HMM will track the list of all
 * mirrors for each mm_struct.
 */
struct hmm_mirror {
        struct hmm                      *hmm;
        const struct hmm_mirror_ops     *ops;
        struct list_head                list;
};

int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
void hmm_mirror_unregister(struct hmm_mirror *mirror);

/*
 * Please see Documentation/vm/hmm.rst for how to use the range API.
 */
int hmm_range_register(struct hmm_range *range,
                       struct hmm_mirror *mirror,
                       unsigned long start,
                       unsigned long end,
                       unsigned page_shift);
void hmm_range_unregister(struct hmm_range *range);
long hmm_range_snapshot(struct hmm_range *range);
long hmm_range_fault(struct hmm_range *range, bool block);
long hmm_range_dma_map(struct hmm_range *range,
                       struct device *device,
                       dma_addr_t *daddrs,
                       bool block);
long hmm_range_dma_unmap(struct hmm_range *range,
                         struct vm_area_struct *vma,
                         struct device *device,
                         dma_addr_t *daddrs,
                         bool dirty);

/*
 * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
 *
 * When waiting for mmu notifiers we need some kind of timeout, otherwise we
 * could potentially wait forever; 1000ms (i.e., 1s) is already a long time to
 * wait.
 */
#define HMM_RANGE_DEFAULT_TIMEOUT 1000

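/*
 * Hedged sketch only (take_lock/release_lock and the device page table update
 * are hypothetical placeholders; Documentation/vm/hmm.rst has the
 * authoritative pattern): a driver typically uses the range API like this:
 *
 *      hmm_range_register(&range, mirror, start, end, PAGE_SHIFT);
 *
 * again:
 *      if (!hmm_range_wait_until_valid(&range, HMM_RANGE_DEFAULT_TIMEOUT)) {
 *          hmm_range_unregister(&range);
 *          return -EBUSY;
 *      }
 *
 *      down_read(&mm->mmap_sem);
 *      ret = hmm_range_fault(&range, false);
 *      if (ret < 0) {
 *          up_read(&mm->mmap_sem);
 *          if (ret == -EAGAIN)
 *              goto again;
 *          hmm_range_unregister(&range);
 *          return ret;
 *      }
 *
 *      // Take the driver lock that also serializes sync_cpu_device_pagetables()
 *      take_lock(driver->update);
 *      if (!hmm_range_valid(&range)) {
 *          release_lock(driver->update);
 *          up_read(&mm->mmap_sem);
 *          goto again;
 *      }
 *
 *      // Use the pfns array content to update the device page table
 *
 *      hmm_range_unregister(&range);
 *      release_lock(driver->update);
 *      up_read(&mm->mmap_sem);
 */
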
/* Below are for HMM internal use only! Not to be used by device drivers! */
static inline void hmm_mm_init(struct mm_struct *mm)
{
        mm->hmm = NULL;
}
#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
static inline void hmm_mm_init(struct mm_struct *mm) {}
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */

#endif /* LINUX_HMM_H */