/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_TYPES_H
#define _LINUX_MM_TYPES_H

#include <linux/mm_types_task.h>

#include <linux/auxvec.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/uprobes.h>
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>

#include <asm/mmu.h>

#ifndef AT_VECTOR_SIZE_ARCH
#define AT_VECTOR_SIZE_ARCH 0
#endif
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))

struct address_space;
struct mem_cgroup;
struct hmm;

/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page, though if it is a pagecache page, rmap structures can tell us
 * who is mapping it.
 *
 * The objects in struct page are organized in double word blocks in
 * order to allow us to use atomic double word operations on portions
 * of struct page. That is currently only used by slub but the arrangement
 * allows the use of atomic double word operations on the flags/mapping
 * and lru list pointers also.
 */
struct page {
	/* First double word block */
	unsigned long flags;		/* Atomic flags, some possibly
					 * updated asynchronously */
	union {
		struct address_space *mapping;	/* If low bit clear, points to
						 * inode address_space, or NULL.
						 * If page mapped as anonymous
						 * memory, low bit is set, and
						 * it points to anon_vma object:
						 * see PAGE_MAPPING_ANON below.
						 */
		void *s_mem;			/* slab first object */
		atomic_t compound_mapcount;	/* first tail page */
		/* page_deferred_list().next -- second tail page */
	};

	/* Second double word */
	union {
		pgoff_t index;		/* Our offset within mapping. */
		void *freelist;		/* sl[aou]b first free object */
		/* page_deferred_list().prev -- second tail page */
	};

	union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
	defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
		/* Used for cmpxchg_double in slub */
		unsigned long counters;
#else
		/*
		 * Keep _refcount separate from slub cmpxchg_double data.
		 * As the rest of the double word is protected by slab_lock
		 * but _refcount is not.
		 */
		unsigned counters;
#endif
		struct {

			union {
				/*
				 * Count of ptes mapped in mms, to show when
				 * page is mapped & limit reverse map searches.
				 *
				 * Extra information about page type may be
				 * stored here for pages that are never mapped,
				 * in which case the value MUST BE <= -2.
				 * See page-flags.h for more details.
				 */
				atomic_t _mapcount;

				unsigned int active;	/* SLAB */
				struct {		/* SLUB */
					unsigned inuse:16;
					unsigned objects:15;
					unsigned frozen:1;
				};
				int units;		/* SLOB */
			};
			/*
			 * Usage count, *USE WRAPPER FUNCTION* when manual
			 * accounting. See page_ref.h
			 */
			atomic_t _refcount;
		};
	};

	/*
	 * Third double word block
	 *
	 * WARNING: bit 0 of the first word encodes PageTail(). That means
	 * the other users of the storage space MUST NOT use the bit to
	 * avoid collision and false-positive PageTail().
	 */
	union {
		struct list_head lru;	/* Pageout list, eg. active_list
					 * protected by zone_lru_lock !
					 * Can be used as a generic list
					 * by the page owner.
					 */
		struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
					    * lru or handled by a slab
					    * allocator, this points to the
					    * hosting device page map.
					    */
		struct {		/* slub per cpu partial pages */
			struct page *next;	/* Next partial slab */
#ifdef CONFIG_64BIT
			int pages;	/* Nr of partial slabs left */
			int pobjects;	/* Approximate # of objects */
#else
			short int pages;
			short int pobjects;
#endif
		};

		struct rcu_head rcu_head;	/* Used by SLAB
						 * when destroying via RCU
						 */
		/* Tail pages of compound page */
		struct {
			unsigned long compound_head; /* If bit zero is set */

			/* First tail page only */
#ifdef CONFIG_64BIT
			/*
			 * On 64 bit system we have enough space in struct page
			 * to encode compound_dtor and compound_order with
			 * unsigned int. It can help compiler generate better or
			 * smaller code on some architectures.
			 */
			unsigned int compound_dtor;
			unsigned int compound_order;
#else
			unsigned short int compound_dtor;
			unsigned short int compound_order;
#endif
		};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
		struct {
			unsigned long __pad;	/* do not overlay pmd_huge_pte
						 * with compound_head to avoid
						 * possible bit 0 collision.
						 */
			pgtable_t pmd_huge_pte; /* protected by page->ptl */
		};
#endif
	};

	/* Remainder is not double word aligned */
	union {
		unsigned long private;		/* Mapping-private opaque data:
						 * usually used for buffer_heads
						 * if PagePrivate set; used for
						 * swp_entry_t if PageSwapCache;
						 * indicates order in the buddy
						 * system if PG_buddy is set.
						 */
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
		spinlock_t *ptl;
#else
		spinlock_t ptl;
#endif
#endif
		struct kmem_cache *slab_cache;	/* SL[AU]B: Pointer to slab */
	};

#ifdef CONFIG_MEMCG
	struct mem_cgroup *mem_cgroup;
#endif

	/*
	 * On machines where all RAM is mapped into kernel address space,
	 * we can simply calculate the virtual address. On machines with
	 * highmem some memory is mapped into kernel virtual memory
	 * dynamically, so we need a place to store that address.
	 * Note that this field could be 16 bits on x86 ... ;)
	 *
	 * Architectures with slow multiplication can define
	 * WANT_PAGE_VIRTUAL in asm/page.h
	 */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;			/* Kernel virtual address (NULL if
					   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef CONFIG_KMEMCHECK
	/*
	 * kmemcheck wants to track the status of each byte in a page; this
	 * is a pointer to such a status block. NULL if not tracked.
	 */
	void *shadow;
#endif

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
	int _last_cpupid;
#endif
}
/*
 * The struct page can be forced to be double word aligned so that atomic ops
 * on double words work. The SLUB allocator can make use of such a feature.
 */
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
	__aligned(2 * sizeof(unsigned long))
#endif
;
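
/*
 * Illustrative sketch, not part of this header: how bit zero of
 * compound_head is consumed.  The real helpers live in
 * <linux/page-flags.h>; this is only an approximation of what they do
 * with the encoding documented above (head pointer | 1 in tail pages).
 *
 *	static inline struct page *example_compound_head(struct page *page)
 *	{
 *		unsigned long head = READ_ONCE(page->compound_head);
 *
 *		if (head & 1)				// PageTail()
 *			return (struct page *)(head - 1);
 *		return page;				// head or order-0 page
 *	}
 */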

#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)

struct page_frag_cache {
	void *va;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
	__u16 offset;
	__u16 size;
#else
	__u32 offset;
#endif
	/* we maintain a pagecount bias, so that we don't dirty the cache line
	 * containing page->_refcount every time we allocate a fragment.
	 */
	unsigned int		pagecnt_bias;
	bool pfmemalloc;
};
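
/*
 * Illustrative sketch, not part of this header: typical use of a
 * page_frag_cache.  page_frag_alloc() and page_frag_free() are
 * declared elsewhere (<linux/gfp.h> at the time of writing; treat the
 * exact location as an assumption of this example).
 *
 *	struct page_frag_cache nc = {};	// va == NULL forces a refill
 *	void *frag;
 *
 *	frag = page_frag_alloc(&nc, 256, GFP_ATOMIC);
 *	if (frag) {
 *		// ... fill the 256 byte fragment ...
 *		page_frag_free(frag);
 *	}
 */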

typedef unsigned long vm_flags_t;

/*
 * A region containing a mapping of a non-memory backed file under NOMMU
 * conditions. These are held in a global tree and are pinned by the VMAs that
 * map parts of them.
 */
struct vm_region {
	struct rb_node	vm_rb;		/* link in global region tree */
	vm_flags_t	vm_flags;	/* VMA vm_flags */
	unsigned long	vm_start;	/* start address of region */
	unsigned long	vm_end;		/* region initialised to here */
	unsigned long	vm_top;		/* region allocated to here */
	unsigned long	vm_pgoff;	/* the offset in vm_file corresponding to vm_start */
	struct file	*vm_file;	/* the backing file or NULL */

	int		vm_usage;	/* region usage count (access under nommu_region_sem) */
	bool		vm_icache_flushed : 1; /* true if the icache has been flushed for
						* this region */
};

#ifdef CONFIG_USERFAULTFD
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
struct vm_userfaultfd_ctx {
	struct userfaultfd_ctx *ctx;
};
#else /* CONFIG_USERFAULTFD */
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
struct vm_userfaultfd_ctx {};
#endif /* CONFIG_USERFAULTFD */

/*
 * This struct describes a virtual memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */

	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next, *vm_prev;

	struct rb_node vm_rb;

	/*
	 * Largest free memory gap in bytes to the left of this VMA.
	 * Either between this VMA and vma->vm_prev, or between one of the
	 * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
	 * get_unmapped_area find a free area of the right size.
	 */
	unsigned long rb_subtree_gap;

	/* Second cache line starts here. */

	struct mm_struct *vm_mm;	/* The address space we belong to. */
	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
	unsigned long vm_flags;		/* Flags, see mm.h. */

	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap interval tree.
	 */
	struct {
		struct rb_node rb;
		unsigned long rb_subtree_last;
	} shared;

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages.  A MAP_SHARED vma
	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_sem &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units */
	struct file *vm_file;		/* File we map to (can be NULL). */
	void *vm_private_data;		/* was vm_pte (shared mem) */

	atomic_long_t swap_readahead_info;
#ifndef CONFIG_MMU
	struct vm_region *vm_region;	/* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
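
/*
 * Illustrative sketch, not part of this header: VMAs of an mm hang off
 * mm->mmap (see struct mm_struct below) as an address-ordered list
 * chained through vm_next and protected by mm->mmap_sem, so a
 * read-side walk commonly looks like:
 *
 *	struct vm_area_struct *vma;
 *
 *	down_read(&mm->mmap_sem);
 *	for (vma = mm->mmap; vma; vma = vma->vm_next)
 *		pr_debug("vma %lx-%lx\n", vma->vm_start, vma->vm_end);
 *	up_read(&mm->mmap_sem);
 */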

struct core_thread {
	struct task_struct *task;
	struct core_thread *next;
};

struct core_state {
	atomic_t nr_threads;
	struct core_thread dumper;
	struct completion startup;
};

struct kioctx_table;
struct mm_struct {
	struct vm_area_struct *mmap;		/* list of VMAs */
	struct rb_root mm_rb;
	u32 vmacache_seqnum;			/* per-thread vmacache */
#ifdef CONFIG_MMU
	unsigned long (*get_unmapped_area) (struct file *filp,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags);
#endif
	unsigned long mmap_base;		/* base of mmap area */
	unsigned long mmap_legacy_base;		/* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
	/* Base addresses for compatible mmap() */
	unsigned long mmap_compat_base;
	unsigned long mmap_compat_legacy_base;
#endif
	unsigned long task_size;		/* size of task vm space */
	unsigned long highest_vm_end;		/* highest vma end address */
	pgd_t *pgd;

	/**
	 * @mm_users: The number of users including userspace.
	 *
	 * Use mmget()/mmget_not_zero()/mmput() to modify. When this drops
	 * to 0 (i.e. when the task exits and there are no other temporary
	 * reference holders), we also release a reference on @mm_count
	 * (which may then free the &struct mm_struct if @mm_count also
	 * drops to 0).
	 */
	atomic_t mm_users;

	/**
	 * @mm_count: The number of references to &struct mm_struct
	 * (@mm_users counts as 1).
	 *
	 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
	 * &struct mm_struct is freed.
	 */
	atomic_t mm_count;

	atomic_long_t nr_ptes;			/* PTE page table pages */
#if CONFIG_PGTABLE_LEVELS > 2
	atomic_long_t nr_pmds;			/* PMD page table pages */
#endif
	int map_count;				/* number of VMAs */

	spinlock_t page_table_lock;		/* Protects page tables and some counters */
	struct rw_semaphore mmap_sem;

	struct list_head mmlist;		/* List of maybe swapped mm's.	These are globally strung
						 * together off init_mm.mmlist, and are protected
						 * by mmlist_lock
						 */

	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
	unsigned long hiwater_vm;	/* High-water virtual memory usage */

	unsigned long total_vm;		/* Total pages mapped */
	unsigned long locked_vm;	/* Pages that have PG_mlocked set */
	unsigned long pinned_vm;	/* Refcount permanently increased */
	unsigned long data_vm;		/* VM_WRITE & ~VM_SHARED & ~VM_STACK */
	unsigned long exec_vm;		/* VM_EXEC & ~VM_WRITE & ~VM_STACK */
	unsigned long stack_vm;		/* VM_STACK */
	unsigned long def_flags;
	unsigned long start_code, end_code, start_data, end_data;
	unsigned long start_brk, brk, start_stack;
	unsigned long arg_start, arg_end, env_start, env_end;

	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

	/*
	 * Special counters, in some configurations protected by the
	 * page_table_lock, in other configurations by being atomic.
	 */
	struct mm_rss_stat rss_stat;

	struct linux_binfmt *binfmt;

	cpumask_var_t cpu_vm_mask_var;

	/* Architecture-specific MM context */
	mm_context_t context;

	unsigned long flags; /* Must use atomic bitops to access the bits */

	struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_MEMBARRIER
	atomic_t membarrier_state;
#endif
#ifdef CONFIG_AIO
	spinlock_t			ioctx_lock;
	struct kioctx_table __rcu	*ioctx_table;
#endif
#ifdef CONFIG_MEMCG
	/*
	 * "owner" points to a task that is regarded as the canonical
	 * user/owner of this mm. All of the following must be true in
	 * order for it to be changed:
	 *
	 * current == mm->owner
	 * current->mm != mm
	 * new_owner->mm == mm
	 * new_owner->alloc_lock is held
	 */
	struct task_struct __rcu *owner;
#endif
	struct user_namespace *user_ns;

	/* store ref to file /proc/<pid>/exe symlink points to */
	struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
	struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
	struct cpumask cpumask_allocation;
#endif
#ifdef CONFIG_NUMA_BALANCING
	/*
	 * numa_next_scan is the next time that the PTEs will be marked
	 * pte_numa. NUMA hinting faults will gather statistics and migrate
	 * pages to new nodes if necessary.
	 */
	unsigned long numa_next_scan;

	/* Restart point for scanning and setting pte_numa */
	unsigned long numa_scan_offset;

	/* numa_scan_seq prevents two threads setting pte_numa */
	int numa_scan_seq;
#endif
	/*
	 * An operation with batched TLB flushing is going on. Anything that
	 * can move process memory needs to flush the TLB when moving a
	 * PROT_NONE or PROT_NUMA mapped page.
	 */
	atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
	/* See flush_tlb_batched_pending() */
	bool tlb_flush_batched;
#endif
	struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
	atomic_long_t hugetlb_usage;
#endif
	struct work_struct async_put_work;

#if IS_ENABLED(CONFIG_HMM)
	/* HMM needs to track a few things per mm */
	struct hmm *hmm;
#endif
} __randomize_layout;
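
/*
 * Illustrative sketch, not part of this header: the two reference
 * counts above are only ever manipulated through their wrappers
 * (mmgrab()/mmdrop() for @mm_count, mmget_not_zero()/mmput() for
 * @mm_users; they live in <linux/sched/mm.h>, which is an assumption
 * of this example rather than something this header provides).  A
 * consumer that wants to look at another task's address space would
 * typically do:
 *
 *	mmgrab(mm);			// keep the struct mm_struct itself
 *	if (mmget_not_zero(mm)) {	// pin the address space as well
 *		// ... walk VMAs / page tables ...
 *		mmput(mm);		// may tear down the address space
 *	}
 *	mmdrop(mm);			// may free the struct mm_struct
 */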

extern struct mm_struct init_mm;

static inline void mm_init_cpumask(struct mm_struct *mm)
{
#ifdef CONFIG_CPUMASK_OFFSTACK
	mm->cpu_vm_mask_var = &mm->cpumask_allocation;
#endif
	cpumask_clear(mm->cpu_vm_mask_var);
}

/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
{
	return mm->cpu_vm_mask_var;
}

struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
				unsigned long start, unsigned long end);
extern void tlb_finish_mmu(struct mmu_gather *tlb,
				unsigned long start, unsigned long end);

static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
	atomic_set(&mm->tlb_flush_pending, 0);
}

static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
	atomic_inc(&mm->tlb_flush_pending);
	/*
	 * The only time this value is relevant is when there are indeed pages
	 * to flush. And we'll only flush pages after changing them, which
	 * requires the PTL.
	 *
	 * So the ordering here is:
	 *
	 *	atomic_inc(&mm->tlb_flush_pending);
	 *	spin_lock(&ptl);
	 *	...
	 *	set_pte_at();
	 *	spin_unlock(&ptl);
	 *
	 *				spin_lock(&ptl)
	 *				mm_tlb_flush_pending();
	 *				....
	 *				spin_unlock(&ptl);
	 *
	 *	flush_tlb_range();
	 *	atomic_dec(&mm->tlb_flush_pending);
	 *
	 * Where the increment is constrained by the PTL unlock, it thus
	 * ensures that the increment is visible if the PTE modification is
	 * visible. After all, if there is no PTE modification, nobody cares
	 * about TLB flushes either.
	 *
	 * This very much relies on users (mm_tlb_flush_pending() and
	 * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
	 * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
	 * locks (PPC) the unlock of one doesn't order against the lock of
	 * another PTL.
	 *
	 * The decrement is ordered by the flush_tlb_range(), such that
	 * mm_tlb_flush_pending() will not return false unless all flushes have
	 * completed.
	 */
}

static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
	/*
	 * See inc_tlb_flush_pending().
	 *
	 * This cannot be smp_mb__before_atomic() because smp_mb() simply does
	 * not order against TLB invalidate completion, which is what we need.
	 *
	 * Therefore we must rely on tlb_flush_*() to guarantee order.
	 */
	atomic_dec(&mm->tlb_flush_pending);
}
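
/*
 * Illustrative sketch, not part of this header: the usual shape of a
 * batched-flush user, matching the ordering documented in
 * inc_tlb_flush_pending() above.  flush_tlb_range() stands in for
 * whatever flush primitive the caller actually uses.
 *
 *	inc_tlb_flush_pending(mm);
 *	spin_lock(ptl);
 *	// ... modify PTEs, note the affected range ...
 *	spin_unlock(ptl);
 *	flush_tlb_range(vma, start, end);
 *	dec_tlb_flush_pending(mm);
 */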

static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
	/*
	 * Must be called after having acquired the PTL; orders against that
	 * PTL's release and therefore ensures that if we observe the modified
	 * PTE we must also observe the increment from inc_tlb_flush_pending().
	 *
	 * That is, it only guarantees to return true if there is a flush
	 * pending for _this_ PTL.
	 */
	return atomic_read(&mm->tlb_flush_pending);
}

static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
	/*
	 * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
	 * for which there is a TLB flush pending in order to guarantee
	 * we've seen both that PTE modification and the increment.
	 *
	 * (no requirement on actually still holding the PTL, that is irrelevant)
	 */
	return atomic_read(&mm->tlb_flush_pending) > 1;
}

struct vm_fault;

struct vm_special_mapping {
	const char *name;	/* The name, e.g. "[vdso]". */

	/*
	 * If .fault is not provided, this points to a
	 * NULL-terminated array of pages that back the special mapping.
	 *
	 * This must not be NULL unless .fault is provided.
	 */
	struct page **pages;

	/*
	 * If non-NULL, then this is called to resolve page faults
	 * on the special mapping. If used, .pages is not checked.
	 */
	int (*fault)(const struct vm_special_mapping *sm,
		     struct vm_area_struct *vma,
		     struct vm_fault *vmf);

	int (*mremap)(const struct vm_special_mapping *sm,
		     struct vm_area_struct *new_vma);
};
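
/*
 * Illustrative sketch, not part of this header: how a special mapping
 * is typically described and installed (the vdso is the canonical
 * user).  _install_special_mapping() is declared in <linux/mm.h>;
 * relying on it here is an assumption of this example.
 *
 *	static struct page *example_pages[2];	// [0] set at init, [1] = NULL
 *	static const struct vm_special_mapping example_mapping = {
 *		.name	= "[example]",
 *		.pages	= example_pages,
 *	};
 *
 *	vma = _install_special_mapping(mm, addr, PAGE_SIZE,
 *				       VM_READ | VM_MAYREAD,
 *				       &example_mapping);
 */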

enum tlb_flush_reason {
	TLB_FLUSH_ON_TASK_SWITCH,
	TLB_REMOTE_SHOOTDOWN,
	TLB_LOCAL_SHOOTDOWN,
	TLB_LOCAL_MM_SHOOTDOWN,
	TLB_REMOTE_SEND_IPI,
	NR_TLB_FLUSH_REASONS,
};

/*
 * A swap entry has to fit into an "unsigned long", as the entry is hidden
 * in the "index" field of the swapper address space.
 */
typedef struct {
	unsigned long val;
} swp_entry_t;

#endif /* _LINUX_MM_TYPES_H */