// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

15#include <linux/anon_inodes.h>
16#include <linux/slab.h>
17#include <linux/sched/autogroup.h>
18#include <linux/sched/mm.h>
19#include <linux/sched/coredump.h>
20#include <linux/sched/user.h>
21#include <linux/sched/numa_balancing.h>
22#include <linux/sched/stat.h>
23#include <linux/sched/task.h>
24#include <linux/sched/task_stack.h>
25#include <linux/sched/cputime.h>
26#include <linux/seq_file.h>
27#include <linux/rtmutex.h>
28#include <linux/init.h>
29#include <linux/unistd.h>
30#include <linux/module.h>
31#include <linux/vmalloc.h>
32#include <linux/completion.h>
33#include <linux/personality.h>
34#include <linux/mempolicy.h>
35#include <linux/sem.h>
36#include <linux/file.h>
37#include <linux/fdtable.h>
38#include <linux/iocontext.h>
39#include <linux/key.h>
40#include <linux/binfmts.h>
41#include <linux/mman.h>
42#include <linux/mmu_notifier.h>
43#include <linux/fs.h>
44#include <linux/mm.h>
45#include <linux/vmacache.h>
46#include <linux/nsproxy.h>
47#include <linux/capability.h>
48#include <linux/cpu.h>
49#include <linux/cgroup.h>
50#include <linux/security.h>
51#include <linux/hugetlb.h>
52#include <linux/seccomp.h>
53#include <linux/swap.h>
54#include <linux/syscalls.h>
55#include <linux/jiffies.h>
56#include <linux/futex.h>
57#include <linux/compat.h>
58#include <linux/kthread.h>
59#include <linux/task_io_accounting_ops.h>
60#include <linux/rcupdate.h>
61#include <linux/ptrace.h>
62#include <linux/mount.h>
63#include <linux/audit.h>
64#include <linux/memcontrol.h>
65#include <linux/ftrace.h>
66#include <linux/proc_fs.h>
67#include <linux/profile.h>
68#include <linux/rmap.h>
69#include <linux/ksm.h>
70#include <linux/acct.h>
71#include <linux/userfaultfd_k.h>
72#include <linux/tsacct_kern.h>
73#include <linux/cn_proc.h>
74#include <linux/freezer.h>
75#include <linux/delayacct.h>
76#include <linux/taskstats_kern.h>
77#include <linux/random.h>
78#include <linux/tty.h>
79#include <linux/fs_struct.h>
80#include <linux/magic.h>
81#include <linux/perf_event.h>
82#include <linux/posix-timers.h>
83#include <linux/user-return-notifier.h>
84#include <linux/oom.h>
85#include <linux/khugepaged.h>
86#include <linux/signalfd.h>
87#include <linux/uprobes.h>
88#include <linux/aio.h>
89#include <linux/compiler.h>
90#include <linux/sysctl.h>
91#include <linux/kcov.h>
92#include <linux/livepatch.h>
93#include <linux/thread_info.h>
94#include <linux/stackleak.h>
95#include <linux/kasan.h>
96#include <linux/scs.h>
97#include <linux/io_uring.h>
98#include <linux/bpf.h>
99
100#include <asm/pgalloc.h>
101#include <linux/uaccess.h>
102#include <asm/mmu_context.h>
103#include <asm/cacheflush.h>
104#include <asm/tlbflush.h>
105
106#include <trace/events/sched.h>
107
108#define CREATE_TRACE_POINTS
109#include <trace/events/task.h>
110
/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK
120
/*
 * Protected by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int nr_threads;			/* The idle threads do not count.. */

static int max_threads;		/* tunable limit on nr_threads */
128
129#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
130
131static const char * const resident_page_types[] = {
132 NAMED_ARRAY_INDEX(MM_FILEPAGES),
133 NAMED_ARRAY_INDEX(MM_ANONPAGES),
134 NAMED_ARRAY_INDEX(MM_SWAPENTS),
135 NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
136};
137
138DEFINE_PER_CPU(unsigned long, process_counts) = 0;
139
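/* Serializes updates to the process list and thread-group links. */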
140__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);
141
142#ifdef CONFIG_PROVE_RCU
143int lockdep_tasklist_lock_is_held(void)
144{
145 return lockdep_is_held(&tasklist_lock);
146}
147EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
148#endif
149
150int nr_processes(void)
151{
152 int cpu;
153 int total = 0;
154
155 for_each_possible_cpu(cpu)
156 total += per_cpu(process_counts, cpu);
157
158 return total;
159}
160
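/* Weak default; architectures can override to release arch-specific state. */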
161void __weak arch_release_task_struct(struct task_struct *tsk)
162{
163}
164
165#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
166static struct kmem_cache *task_struct_cachep;
167
168static inline struct task_struct *alloc_task_struct_node(int node)
169{
170 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
171}
172
173static inline void free_task_struct(struct task_struct *tsk)
174{
175 kmem_cache_free(task_struct_cachep, tsk);
176}
177#endif
178
179#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

#ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
193static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
194
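/* CPU hotplug teardown callback: frees the outgoing CPU's cached stacks. */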
195static int free_vm_stack_cache(unsigned int cpu)
196{
197 struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
198 int i;
199
200 for (i = 0; i < NR_CACHED_STACKS; i++) {
201 struct vm_struct *vm_stack = cached_vm_stacks[i];
202
203 if (!vm_stack)
204 continue;
205
206 vfree(vm_stack->addr);
207 cached_vm_stacks[i] = NULL;
208 }
209
210 return 0;
211}
212#endif
213
214static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
215{
216#ifdef CONFIG_VMAP_STACK
217 void *stack;
218 int i;
219
220 for (i = 0; i < NR_CACHED_STACKS; i++) {
221 struct vm_struct *s;
222
223 s = this_cpu_xchg(cached_stacks[i], NULL);
224
225 if (!s)
226 continue;

		/* Mark stack accessible for KASAN. */
		kasan_unpoison_range(s->addr, THREAD_SIZE);

		/* Clear stale pointers from reused stack. */
		memset(s->addr, 0, THREAD_SIZE);
233
234 tsk->stack_vm_area = s;
235 tsk->stack = s->addr;
236 return s->addr;
237 }
238
	/*
	 * Allocated stacks are cached and later reused by new threads,
	 * so memcg accounting is performed manually on assigning/releasing
	 * stacks to tasks. Drop __GFP_ACCOUNT.
	 */
	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
				     VMALLOC_START, VMALLOC_END,
				     THREADINFO_GFP & ~__GFP_ACCOUNT,
				     PAGE_KERNEL,
				     0, node, __builtin_return_address(0));
249
	/*
	 * We can't call find_vm_area() in interrupt context, and
	 * free_thread_stack() can be called in interrupt context,
	 * so cache the vm_struct.
	 */
	if (stack) {
		tsk->stack_vm_area = find_vm_area(stack);
		tsk->stack = stack;
	}
259 return stack;
260#else
261 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
262 THREAD_SIZE_ORDER);
263
264 if (likely(page)) {
265 tsk->stack = kasan_reset_tag(page_address(page));
266 return tsk->stack;
267 }
268 return NULL;
269#endif
270}
271
272static inline void free_thread_stack(struct task_struct *tsk)
273{
274#ifdef CONFIG_VMAP_STACK
275 struct vm_struct *vm = task_stack_vm_area(tsk);
276
277 if (vm) {
278 int i;
279
280 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
281 memcg_kmem_uncharge_page(vm->pages[i], 0);
282
283 for (i = 0; i < NR_CACHED_STACKS; i++) {
284 if (this_cpu_cmpxchg(cached_stacks[i],
285 NULL, tsk->stack_vm_area) != NULL)
286 continue;
287
288 return;
289 }
290
291 vfree_atomic(tsk->stack);
292 return;
293 }
294#endif
295
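	/* Non-vmapped stacks are a plain order-THREAD_SIZE_ORDER page block. */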
296 __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
297}
298# else
299static struct kmem_cache *thread_stack_cache;
300
301static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
302 int node)
303{
304 unsigned long *stack;
305 stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
306 stack = kasan_reset_tag(stack);
307 tsk->stack = stack;
308 return stack;
309}
310
311static void free_thread_stack(struct task_struct *tsk)
312{
313 kmem_cache_free(thread_stack_cache, tsk->stack);
314}
315
316void thread_stack_cache_init(void)
317{
318 thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
319 THREAD_SIZE, THREAD_SIZE, 0, 0,
320 THREAD_SIZE, NULL);
321 BUG_ON(thread_stack_cache == NULL);
322}
323# endif
324#endif
325
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
343
344struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
345{
346 struct vm_area_struct *vma;
347
348 vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
349 if (vma)
350 vma_init(vma, mm);
351 return vma;
352}
353
354struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
355{
356 struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
357
358 if (new) {
359 ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
360 ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
361
		/*
		 * orig->shared.rb may be modified concurrently, but the clone
		 * will be reinitialized.
		 */
		*new = data_race(*orig);
366 INIT_LIST_HEAD(&new->anon_vma_chain);
367 new->vm_next = new->vm_prev = NULL;
368 }
369 return new;
370}
371
372void vm_area_free(struct vm_area_struct *vma)
373{
374 kmem_cache_free(vm_area_cachep, vma);
375}
376
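/* Charge or uncharge the kernel stack to NR_KERNEL_STACK_KB (account is +1/-1). */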
377static void account_kernel_stack(struct task_struct *tsk, int account)
378{
379 void *stack = task_stack_page(tsk);
380 struct vm_struct *vm = task_stack_vm_area(tsk);
381
382 if (vm) {
383 int i;
384
385 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
386 mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
387 account * (PAGE_SIZE / 1024));
388 } else {
		/* All stack pages are in the same node. */
		mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
				      account * (THREAD_SIZE / 1024));
392 }
393}
394
395static int memcg_charge_kernel_stack(struct task_struct *tsk)
396{
397#ifdef CONFIG_VMAP_STACK
398 struct vm_struct *vm = task_stack_vm_area(tsk);
399 int ret;
400
401 BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
402
403 if (vm) {
404 int i;
405
406 BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
407
408 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
			/*
			 * If memcg_kmem_charge_page() fails, page's
			 * memory cgroup pointer is NULL, and
			 * memcg_kmem_uncharge_page() in free_thread_stack()
			 * will ignore this page.
			 */
			ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
						     0);
417 if (ret)
418 return ret;
419 }
420 }
421#endif
422 return 0;
423}
424
425static void release_task_stack(struct task_struct *tsk)
426{
427 if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
428 return;
429
430 account_kernel_stack(tsk, -1);
431 free_thread_stack(tsk);
432 tsk->stack = NULL;
433#ifdef CONFIG_VMAP_STACK
434 tsk->stack_vm_area = NULL;
435#endif
436}
437
438#ifdef CONFIG_THREAD_INFO_IN_TASK
439void put_task_stack(struct task_struct *tsk)
440{
441 if (refcount_dec_and_test(&tsk->stack_refcount))
442 release_task_stack(tsk);
443}
444#endif
445
446void free_task(struct task_struct *tsk)
447{
448 release_user_cpus_ptr(tsk);
449 scs_release(tsk);
450
451#ifndef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * The task is finally done with both the stack and thread_info,
	 * so free both.
	 */
	release_task_stack(tsk);
#else
	/*
	 * If the task had a separate stack allocation, it should be gone
	 * by now.
	 */
	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
463#endif
464 rt_mutex_debug_task_free(tsk);
465 ftrace_graph_exit_task(tsk);
466 arch_release_task_struct(tsk);
467 if (tsk->flags & PF_KTHREAD)
468 free_kthread_struct(tsk);
469 free_task_struct(tsk);
470}
471EXPORT_SYMBOL(free_task);
472
473static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
474{
475 struct file *exe_file;
476
477 exe_file = get_mm_exe_file(oldmm);
478 RCU_INIT_POINTER(mm->exe_file, exe_file);
479
	/*
	 * We depend on the oldmm having properly denied write access to the
	 * exe_file already.
	 */
	if (exe_file && deny_write_access(exe_file))
		pr_warn_once("deny_write_access() failed in %s\n", __func__);
485}
486
487#ifdef CONFIG_MMU
488static __latent_entropy int dup_mmap(struct mm_struct *mm,
489 struct mm_struct *oldmm)
490{
491 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
492 struct rb_node **rb_link, *rb_parent;
493 int retval;
494 unsigned long charge;
495 LIST_HEAD(uf);
496
497 uprobe_start_dup_mmap();
498 if (mmap_write_lock_killable(oldmm)) {
499 retval = -EINTR;
500 goto fail_uprobe_end;
501 }
502 flush_cache_dup_mm(oldmm);
503 uprobe_dup_mmap(oldmm, mm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);

	/* No ordering required: file already has been exposed. */
	dup_mm_exe_file(mm, oldmm);
511
512 mm->total_vm = oldmm->total_vm;
513 mm->data_vm = oldmm->data_vm;
514 mm->exec_vm = oldmm->exec_vm;
515 mm->stack_vm = oldmm->stack_vm;
516
517 rb_link = &mm->mm_rb.rb_node;
518 rb_parent = NULL;
519 pprev = &mm->mmap;
520 retval = ksm_fork(mm, oldmm);
521 if (retval)
522 goto out;
523 retval = khugepaged_fork(mm, oldmm);
524 if (retval)
525 goto out;
526
527 prev = NULL;
528 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
529 struct file *file;
530
531 if (mpnt->vm_flags & VM_DONTCOPY) {
532 vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
533 continue;
534 }
535 charge = 0;
		/*
		 * Don't duplicate many vmas if we've been oom-killed (for
		 * example)
		 */
		if (fatal_signal_pending(current)) {
			retval = -EINTR;
			goto out;
		}
544 if (mpnt->vm_flags & VM_ACCOUNT) {
545 unsigned long len = vma_pages(mpnt);
546
547 if (security_vm_enough_memory_mm(oldmm, len))
548 goto fail_nomem;
549 charge = len;
550 }
551 tmp = vm_area_dup(mpnt);
552 if (!tmp)
553 goto fail_nomem;
554 retval = vma_dup_policy(mpnt, tmp);
555 if (retval)
556 goto fail_nomem_policy;
557 tmp->vm_mm = mm;
558 retval = dup_userfaultfd(tmp, &uf);
559 if (retval)
560 goto fail_nomem_anon_vma_fork;
561 if (tmp->vm_flags & VM_WIPEONFORK) {
			/*
			 * VM_WIPEONFORK gets a clean slate in the child.
			 * Don't prepare anon_vma until fault since we don't
			 * copy page for current vma.
			 */
			tmp->anon_vma = NULL;
568 } else if (anon_vma_fork(tmp, mpnt))
569 goto fail_nomem_anon_vma_fork;
570 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
571 file = tmp->vm_file;
572 if (file) {
573 struct address_space *mapping = file->f_mapping;
574
575 get_file(file);
576 i_mmap_lock_write(mapping);
577 if (tmp->vm_flags & VM_SHARED)
578 mapping_allow_writable(mapping);
579 flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			vma_interval_tree_insert_after(tmp, mpnt,
					&mapping->i_mmap);
583 flush_dcache_mmap_unlock(mapping);
584 i_mmap_unlock_write(mapping);
585 }
586
		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);
594
		/*
		 * Link in the new vma and copy the page table entries.
		 */
		*pprev = tmp;
599 pprev = &tmp->vm_next;
600 tmp->vm_prev = prev;
601 prev = tmp;
602
603 __vma_link_rb(mm, tmp, rb_link, rb_parent);
604 rb_link = &tmp->vm_rb.rb_right;
605 rb_parent = &tmp->vm_rb;
606
607 mm->map_count++;
608 if (!(tmp->vm_flags & VM_WIPEONFORK))
609 retval = copy_page_range(tmp, mpnt);
610
611 if (tmp->vm_ops && tmp->vm_ops->open)
612 tmp->vm_ops->open(tmp);
613
614 if (retval)
615 goto out;
616 }
617
618 retval = arch_dup_mmap(oldmm, mm);
619out:
620 mmap_write_unlock(mm);
621 flush_tlb_mm(oldmm);
622 mmap_write_unlock(oldmm);
623 dup_userfaultfd_complete(&uf);
624fail_uprobe_end:
625 uprobe_end_dup_mmap();
626 return retval;
627fail_nomem_anon_vma_fork:
628 mpol_put(vma_policy(tmp));
629fail_nomem_policy:
630 vm_area_free(tmp);
631fail_nomem:
632 retval = -ENOMEM;
633 vm_unacct_memory(charge);
634 goto out;
635}
636
637static inline int mm_alloc_pgd(struct mm_struct *mm)
638{
639 mm->pgd = pgd_alloc(mm);
640 if (unlikely(!mm->pgd))
641 return -ENOMEM;
642 return 0;
643}
644
645static inline void mm_free_pgd(struct mm_struct *mm)
646{
647 pgd_free(mm, mm->pgd);
648}
649#else
650static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
651{
652 mmap_write_lock(oldmm);
653 dup_mm_exe_file(mm, oldmm);
654 mmap_write_unlock(oldmm);
655 return 0;
656}
657#define mm_alloc_pgd(mm) (0)
658#define mm_free_pgd(mm)
659#endif
660
661static void check_mm(struct mm_struct *mm)
662{
663 int i;
664
665 BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
666 "Please make sure 'struct resident_page_types[]' is updated as well");
667
668 for (i = 0; i < NR_MM_COUNTERS; i++) {
669 long x = atomic_long_read(&mm->rss_stat.count[i]);
670
671 if (unlikely(x))
672 pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
673 mm, resident_page_types[i], x);
674 }
675
676 if (mm_pgtables_bytes(mm))
677 pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
678 mm_pgtables_bytes(mm));
679
680#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
681 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
682#endif
683}
684
685#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
686#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
687
/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
693void __mmdrop(struct mm_struct *mm)
694{
695 BUG_ON(mm == &init_mm);
696 WARN_ON_ONCE(mm == current->mm);
697 WARN_ON_ONCE(mm == current->active_mm);
698 mm_free_pgd(mm);
699 destroy_context(mm);
700 mmu_notifier_subscriptions_destroy(mm);
701 check_mm(mm);
702 put_user_ns(mm->user_ns);
703 free_mm(mm);
704}
705EXPORT_SYMBOL_GPL(__mmdrop);
706
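/* Workqueue callback backing mmdrop_async(). */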
707static void mmdrop_async_fn(struct work_struct *work)
708{
709 struct mm_struct *mm;
710
711 mm = container_of(work, struct mm_struct, async_put_work);
712 __mmdrop(mm);
713}
714
715static void mmdrop_async(struct mm_struct *mm)
716{
717 if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
718 INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
719 schedule_work(&mm->async_put_work);
720 }
721}
722
723static inline void free_signal_struct(struct signal_struct *sig)
724{
725 taskstats_tgid_free(sig);
726 sched_autogroup_exit(sig);
	/*
	 * __mmdrop is not safe to call from softirq context on x86 due to
	 * pgd_dtor so postpone it to the async context
	 */
	if (sig->oom_mm)
732 mmdrop_async(sig->oom_mm);
733 kmem_cache_free(signal_cachep, sig);
734}
735
736static inline void put_signal_struct(struct signal_struct *sig)
737{
738 if (refcount_dec_and_test(&sig->sigcnt))
739 free_signal_struct(sig);
740}
741
742void __put_task_struct(struct task_struct *tsk)
743{
744 WARN_ON(!tsk->exit_state);
745 WARN_ON(refcount_read(&tsk->usage));
746 WARN_ON(tsk == current);
747
748 io_uring_free(tsk);
749 cgroup_free(tsk);
750 task_numa_free(tsk, true);
751 security_task_free(tsk);
752 bpf_task_storage_free(tsk);
753 exit_creds(tsk);
754 delayacct_tsk_free(tsk);
755 put_signal_struct(tsk->signal);
756 sched_core_free(tsk);
757
758 if (!profile_handoff_task(tsk))
759 free_task(tsk);
760}
761EXPORT_SYMBOL_GPL(__put_task_struct);
762
763void __init __weak arch_task_cache_init(void) { }
764
/*
 * set_max_threads
 */
768static void set_max_threads(unsigned int max_threads_suggested)
769{
770 u64 threads;
771 unsigned long nr_pages = totalram_pages();
772
	/*
	 * The number of threads shall be limited such that the thread
	 * structures may only consume a small part of the available memory.
	 */
	if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
778 threads = MAX_THREADS;
779 else
780 threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
781 (u64) THREAD_SIZE * 8UL);
782
783 if (threads > max_threads_suggested)
784 threads = max_threads_suggested;
785
786 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
787}
788
789#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
792#endif
793
794#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
795static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
796{
	/* Fetch thread_struct whitelist for the architecture. */
	arch_thread_struct_whitelist(offset, size);

	/*
	 * Handle zero-sized whitelist or empty thread_struct, otherwise
	 * adjust offset to position of thread_struct in task_struct.
	 */
	if (unlikely(*size == 0))
		*offset = 0;
	else
		*offset += offsetof(struct task_struct, thread);
808}
809#endif
810
811void __init fork_init(void)
812{
813 int i;
814#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
815#ifndef ARCH_MIN_TASKALIGN
816#define ARCH_MIN_TASKALIGN 0
817#endif
818 int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
819 unsigned long useroffset, usersize;
820
	/* create a slab on which task_structs can be allocated */
	task_struct_whitelist(&useroffset, &usersize);
823 task_struct_cachep = kmem_cache_create_usercopy("task_struct",
824 arch_task_struct_size, align,
825 SLAB_PANIC|SLAB_ACCOUNT,
826 useroffset, usersize, NULL);
827#endif
828
	/* do the arch specific task caches init */
	arch_task_cache_init();
831
832 set_max_threads(MAX_THREADS);
833
834 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
835 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
836 init_task.signal->rlim[RLIMIT_SIGPENDING] =
837 init_task.signal->rlim[RLIMIT_NPROC];
838
839 for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
840 init_user_ns.ucount_max[i] = max_threads/2;
841
842 set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
843 set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
844 set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
845 set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
846
847#ifdef CONFIG_VMAP_STACK
848 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
849 NULL, free_vm_stack_cache);
850#endif
851
852 scs_init();
853
854 lockdep_init_task(&init_task);
855 uprobes_init();
856}
857
858int __weak arch_dup_task_struct(struct task_struct *dst,
859 struct task_struct *src)
860{
861 *dst = *src;
862 return 0;
863}
864
865void set_task_stack_end_magic(struct task_struct *tsk)
866{
867 unsigned long *stackend;
868
869 stackend = end_of_stack(tsk);
870 *stackend = STACK_END_MAGIC;
871}
872
873static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
874{
875 struct task_struct *tsk;
876 unsigned long *stack;
877 struct vm_struct *stack_vm_area __maybe_unused;
878 int err;
879
880 if (node == NUMA_NO_NODE)
881 node = tsk_fork_get_node(orig);
882 tsk = alloc_task_struct_node(node);
883 if (!tsk)
884 return NULL;
885
886 stack = alloc_thread_stack_node(tsk, node);
887 if (!stack)
888 goto free_tsk;
889
890 if (memcg_charge_kernel_stack(tsk))
891 goto free_stack;
892
893 stack_vm_area = task_stack_vm_area(tsk);
894
895 err = arch_dup_task_struct(tsk, orig);
896
	/*
	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
	 * sure they're properly initialized before using any stack-related
	 * functions again.
	 */
	tsk->stack = stack;
903#ifdef CONFIG_VMAP_STACK
904 tsk->stack_vm_area = stack_vm_area;
905#endif
906#ifdef CONFIG_THREAD_INFO_IN_TASK
907 refcount_set(&tsk->stack_refcount, 1);
908#endif
909
910 if (err)
911 goto free_stack;
912
913 err = scs_prepare(tsk, node);
914 if (err)
915 goto free_stack;
916
917#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
	 * the sighand lock in case orig has changed between now and
	 * then. Until then, filter must be NULL to avoid messing up
	 * the usage counts on the error path calling free_task.
	 */
	tsk->seccomp.filter = NULL;
925#endif
926
927 setup_thread_stack(tsk, orig);
928 clear_user_return_notifier(tsk);
929 clear_tsk_need_resched(tsk);
930 set_task_stack_end_magic(tsk);
931 clear_syscall_work_syscall_user_dispatch(tsk);
932
933#ifdef CONFIG_STACKPROTECTOR
934 tsk->stack_canary = get_random_canary();
935#endif
936 if (orig->cpus_ptr == &orig->cpus_mask)
937 tsk->cpus_ptr = &tsk->cpus_mask;
938 dup_user_cpus_ptr(tsk, orig, node);
939
	/*
	 * One for the user space visible state that goes away when reaped.
	 * One for the scheduler.
	 */
	refcount_set(&tsk->rcu_users, 2);
	/* One for the rcu users */
	refcount_set(&tsk->usage, 1);
947#ifdef CONFIG_BLK_DEV_IO_TRACE
948 tsk->btrace_seq = 0;
949#endif
950 tsk->splice_pipe = NULL;
951 tsk->task_frag.page = NULL;
952 tsk->wake_q.next = NULL;
953 tsk->pf_io_worker = NULL;
954
955 account_kernel_stack(tsk, 1);
956
957 kcov_task_init(tsk);
958 kmap_local_fork(tsk);
959
960#ifdef CONFIG_FAULT_INJECTION
961 tsk->fail_nth = 0;
962#endif
963
964#ifdef CONFIG_BLK_CGROUP
965 tsk->throttle_queue = NULL;
966 tsk->use_memdelay = 0;
967#endif
968
969#ifdef CONFIG_MEMCG
970 tsk->active_memcg = NULL;
971#endif
972 return tsk;
973
974free_stack:
975 free_thread_stack(tsk);
976free_tsk:
977 free_task_struct(tsk);
978 return NULL;
979}
980
981__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
982
983static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
984
985static int __init coredump_filter_setup(char *s)
986{
987 default_dump_filter =
988 (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
989 MMF_DUMP_FILTER_MASK;
990 return 1;
991}
992
993__setup("coredump_filter=", coredump_filter_setup);
994
995#include <linux/init_task.h>
996
997static void mm_init_aio(struct mm_struct *mm)
998{
999#ifdef CONFIG_AIO
1000 spin_lock_init(&mm->ioctx_lock);
1001 mm->ioctx_table = NULL;
1002#endif
1003}
1004
1005static __always_inline void mm_clear_owner(struct mm_struct *mm,
1006 struct task_struct *p)
1007{
1008#ifdef CONFIG_MEMCG
1009 if (mm->owner == p)
1010 WRITE_ONCE(mm->owner, NULL);
1011#endif
1012}
1013
1014static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1015{
1016#ifdef CONFIG_MEMCG
1017 mm->owner = p;
1018#endif
1019}
1020
1021static void mm_init_pasid(struct mm_struct *mm)
1022{
1023#ifdef CONFIG_IOMMU_SUPPORT
1024 mm->pasid = INIT_PASID;
1025#endif
1026}
1027
1028static void mm_init_uprobes_state(struct mm_struct *mm)
1029{
1030#ifdef CONFIG_UPROBES
1031 mm->uprobes_state.xol_area = NULL;
1032#endif
1033}
1034
1035static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
1036 struct user_namespace *user_ns)
1037{
1038 mm->mmap = NULL;
1039 mm->mm_rb = RB_ROOT;
1040 mm->vmacache_seqnum = 0;
1041 atomic_set(&mm->mm_users, 1);
1042 atomic_set(&mm->mm_count, 1);
1043 seqcount_init(&mm->write_protect_seq);
1044 mmap_init_lock(mm);
1045 INIT_LIST_HEAD(&mm->mmlist);
1046 mm_pgtables_bytes_init(mm);
1047 mm->map_count = 0;
1048 mm->locked_vm = 0;
1049 atomic64_set(&mm->pinned_vm, 0);
1050 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
1051 spin_lock_init(&mm->page_table_lock);
1052 spin_lock_init(&mm->arg_lock);
1053 mm_init_cpumask(mm);
1054 mm_init_aio(mm);
1055 mm_init_owner(mm, p);
1056 mm_init_pasid(mm);
1057 RCU_INIT_POINTER(mm->exe_file, NULL);
1058 mmu_notifier_subscriptions_init(mm);
1059 init_tlb_flush_pending(mm);
1060#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
1061 mm->pmd_huge_pte = NULL;
1062#endif
1063 mm_init_uprobes_state(mm);
1064 hugetlb_count_init(mm);
1065
1066 if (current->mm) {
1067 mm->flags = current->mm->flags & MMF_INIT_MASK;
1068 mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
1069 } else {
1070 mm->flags = default_dump_filter;
1071 mm->def_flags = 0;
1072 }
1073
1074 if (mm_alloc_pgd(mm))
1075 goto fail_nopgd;
1076
1077 if (init_new_context(p, mm))
1078 goto fail_nocontext;
1079
1080 mm->user_ns = get_user_ns(user_ns);
1081 return mm;
1082
1083fail_nocontext:
1084 mm_free_pgd(mm);
1085fail_nopgd:
1086 free_mm(mm);
1087 return NULL;
1088}
1089
/*
 * Allocate and initialize an mm_struct.
 */
1093struct mm_struct *mm_alloc(void)
1094{
1095 struct mm_struct *mm;
1096
1097 mm = allocate_mm();
1098 if (!mm)
1099 return NULL;
1100
1101 memset(mm, 0, sizeof(*mm));
1102 return mm_init(mm, current, current_user_ns());
1103}
1104
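/* Final mm teardown, called once mm_users has dropped to zero. */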
1105static inline void __mmput(struct mm_struct *mm)
1106{
1107 VM_BUG_ON(atomic_read(&mm->mm_users));
1108
1109 uprobe_clear_state(mm);
1110 exit_aio(mm);
1111 ksm_exit(mm);
	khugepaged_exit(mm); /* must run before exit_mmap */
1113 exit_mmap(mm);
1114 mm_put_huge_zero_page(mm);
1115 set_mm_exe_file(mm, NULL);
1116 if (!list_empty(&mm->mmlist)) {
1117 spin_lock(&mmlist_lock);
1118 list_del(&mm->mmlist);
1119 spin_unlock(&mmlist_lock);
1120 }
1121 if (mm->binfmt)
1122 module_put(mm->binfmt->module);
1123 mmdrop(mm);
1124}
1125
/*
 * Decrement the use count and release all resources for an mm.
 */
1129void mmput(struct mm_struct *mm)
1130{
1131 might_sleep();
1132
1133 if (atomic_dec_and_test(&mm->mm_users))
1134 __mmput(mm);
1135}
1136EXPORT_SYMBOL_GPL(mmput);
1137
1138#ifdef CONFIG_MMU
1139static void mmput_async_fn(struct work_struct *work)
1140{
1141 struct mm_struct *mm = container_of(work, struct mm_struct,
1142 async_put_work);
1143
1144 __mmput(mm);
1145}
1146
1147void mmput_async(struct mm_struct *mm)
1148{
1149 if (atomic_dec_and_test(&mm->mm_users)) {
1150 INIT_WORK(&mm->async_put_work, mmput_async_fn);
1151 schedule_work(&mm->async_put_work);
1152 }
1153}
1154#endif
1155
/**
 * set_mm_exe_file - change a reference to the mm's executable file
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody alive left, in execve task is single
 * threaded.
 *
 * Can only fail if new_exe_file != NULL.
 */
1167int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1168{
1169 struct file *old_exe_file;
1170
	/*
	 * It is safe to dereference the exe_file without RCU as
	 * this function is only called if nobody else can access
	 * this mm -- see comment above for justification.
	 */
	old_exe_file = rcu_dereference_raw(mm->exe_file);
1177
1178 if (new_exe_file) {
		/*
		 * We expect the caller (i.e., sys_execve) to have already
		 * denied write access, so this is unlikely to fail.
		 */
		if (unlikely(deny_write_access(new_exe_file)))
1184 return -EACCES;
1185 get_file(new_exe_file);
1186 }
1187 rcu_assign_pointer(mm->exe_file, new_exe_file);
1188 if (old_exe_file) {
1189 allow_write_access(old_exe_file);
1190 fput(old_exe_file);
1191 }
1192 return 0;
1193}
1194
/**
 * replace_mm_exe_file - replace a reference to the mm's executable file
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
 * dealing with concurrent invocation and without grabbing the mmap lock
 * in write mode.
 *
 * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
 */
1204int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1205{
1206 struct vm_area_struct *vma;
1207 struct file *old_exe_file;
1208 int ret = 0;
1209
	/* Forbid mm->exe_file change if old file still mapped. */
	old_exe_file = get_mm_exe_file(mm);
1212 if (old_exe_file) {
1213 mmap_read_lock(mm);
1214 for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
1215 if (!vma->vm_file)
1216 continue;
1217 if (path_equal(&vma->vm_file->f_path,
1218 &old_exe_file->f_path))
1219 ret = -EBUSY;
1220 }
1221 mmap_read_unlock(mm);
1222 fput(old_exe_file);
1223 if (ret)
1224 return ret;
1225 }
1226
	/* set the new file, lockless */
	ret = deny_write_access(new_exe_file);
1229 if (ret)
1230 return -EACCES;
1231 get_file(new_exe_file);
1232
1233 old_exe_file = xchg(&mm->exe_file, new_exe_file);
1234 if (old_exe_file) {
		/*
		 * Don't race with dup_mmap() getting the file and disallowing
		 * write access while someone might open the file writable.
		 */
		mmap_read_lock(mm);
1240 allow_write_access(old_exe_file);
1241 fput(old_exe_file);
1242 mmap_read_unlock(mm);
1243 }
1244 return 0;
1245}
1246
/**
 * get_mm_exe_file - acquire a reference to the mm's executable file
 *
 * Returns %NULL if mm has no associated executable file.
 * User must release file via fput().
 */
1253struct file *get_mm_exe_file(struct mm_struct *mm)
1254{
1255 struct file *exe_file;
1256
1257 rcu_read_lock();
1258 exe_file = rcu_dereference(mm->exe_file);
1259 if (exe_file && !get_file_rcu(exe_file))
1260 exe_file = NULL;
1261 rcu_read_unlock();
1262 return exe_file;
1263}
1264
/**
 * get_task_exe_file - acquire a reference to the task's executable file
 *
 * Returns %NULL if task's mm (if any) has no associated executable file or
 * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
 * User must release file via fput().
 */
1272struct file *get_task_exe_file(struct task_struct *task)
1273{
1274 struct file *exe_file = NULL;
1275 struct mm_struct *mm;
1276
1277 task_lock(task);
1278 mm = task->mm;
1279 if (mm) {
1280 if (!(task->flags & PF_KTHREAD))
1281 exe_file = get_mm_exe_file(mm);
1282 }
1283 task_unlock(task);
1284 return exe_file;
1285}
1286
/**
 * get_task_mm - acquire a reference to the task's mm
 *
 * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
 * this kernel workthread has transiently adopted a user mm with use_mm,
 * to do its AIO) is not set and if so returns a reference to it, after
 * bumping up the use count.  User must release the mm via mmput()
 * after use.  Typically used by /proc and ptrace.
 */
1296struct mm_struct *get_task_mm(struct task_struct *task)
1297{
1298 struct mm_struct *mm;
1299
1300 task_lock(task);
1301 mm = task->mm;
1302 if (mm) {
1303 if (task->flags & PF_KTHREAD)
1304 mm = NULL;
1305 else
1306 mmget(mm);
1307 }
1308 task_unlock(task);
1309 return mm;
1310}
1311EXPORT_SYMBOL_GPL(get_task_mm);
1312
1313struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
1314{
1315 struct mm_struct *mm;
1316 int err;
1317
1318 err = down_read_killable(&task->signal->exec_update_lock);
1319 if (err)
1320 return ERR_PTR(err);
1321
1322 mm = get_task_mm(task);
1323 if (mm && mm != current->mm &&
1324 !ptrace_may_access(task, mode)) {
1325 mmput(mm);
1326 mm = ERR_PTR(-EACCES);
1327 }
1328 up_read(&task->signal->exec_update_lock);
1329
1330 return mm;
1331}
1332
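/* Wake up a parent sleeping in vfork()/CLONE_VFORK wait, if any. */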
1333static void complete_vfork_done(struct task_struct *tsk)
1334{
1335 struct completion *vfork;
1336
1337 task_lock(tsk);
1338 vfork = tsk->vfork_done;
1339 if (likely(vfork)) {
1340 tsk->vfork_done = NULL;
1341 complete(vfork);
1342 }
1343 task_unlock(tsk);
1344}
1345
1346static int wait_for_vfork_done(struct task_struct *child,
1347 struct completion *vfork)
1348{
1349 int killed;
1350
1351 freezer_do_not_count();
1352 cgroup_enter_frozen();
1353 killed = wait_for_completion_killable(vfork);
1354 cgroup_leave_frozen(false);
1355 freezer_count();
1356
1357 if (killed) {
1358 task_lock(child);
1359 child->vfork_done = NULL;
1360 task_unlock(child);
1361 }
1362
1363 put_task_struct(child);
1364 return killed;
1365}
1366
/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
1380static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1381{
1382 uprobe_free_utask(tsk);
1383
	/* Get rid of any cached register state */
	deactivate_mm(tsk, mm);
1386
	/*
	 * Signal userspace if we're not exiting with a core dump
	 * because we want to leave the value intact for debugging
	 * purposes.
	 */
	if (tsk->clear_child_tid) {
1393 if (atomic_read(&mm->mm_users) > 1) {
			/*
			 * We don't check the error code - if userspace has
			 * not set up a proper pointer then tough luck.
			 */
			put_user(0, tsk->clear_child_tid);
1399 do_futex(tsk->clear_child_tid, FUTEX_WAKE,
1400 1, NULL, NULL, 0, 0);
1401 }
1402 tsk->clear_child_tid = NULL;
1403 }
1404
	/*
	 * All done, finally we can wake up parent and return this mm to him.
	 * Also kthread_stop() uses this completion for synchronization.
	 */
	if (tsk->vfork_done)
		complete_vfork_done(tsk);
1411}
1412
1413void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1414{
1415 futex_exit_release(tsk);
1416 mm_release(tsk, mm);
1417}
1418
1419void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1420{
1421 futex_exec_release(tsk);
1422 mm_release(tsk, mm);
1423}
1424
/**
 * dup_mm() - duplicates an existing mm structure
 * @tsk: the task_struct with which the new mm will be associated.
 * @oldmm: the mm to duplicate.
 *
 * Allocates a new mm structure and duplicates the provided @oldmm structure
 * content into it.
 *
 * Return: the duplicated mm or NULL on failure.
 */
1435static struct mm_struct *dup_mm(struct task_struct *tsk,
1436 struct mm_struct *oldmm)
1437{
1438 struct mm_struct *mm;
1439 int err;
1440
1441 mm = allocate_mm();
1442 if (!mm)
1443 goto fail_nomem;
1444
1445 memcpy(mm, oldmm, sizeof(*mm));
1446
1447 if (!mm_init(mm, tsk, mm->user_ns))
1448 goto fail_nomem;
1449
1450 err = dup_mmap(mm, oldmm);
1451 if (err)
1452 goto free_pt;
1453
1454 mm->hiwater_rss = get_mm_rss(mm);
1455 mm->hiwater_vm = mm->total_vm;
1456
1457 if (mm->binfmt && !try_module_get(mm->binfmt->module))
1458 goto free_pt;
1459
1460 return mm;
1461
1462free_pt:
	/* don't put binfmt in mmput, we haven't got module yet */
	mm->binfmt = NULL;
	mm_init_owner(mm, NULL);
1466 mmput(mm);
1467
1468fail_nomem:
1469 return NULL;
1470}
1471
1472static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1473{
1474 struct mm_struct *mm, *oldmm;
1475
1476 tsk->min_flt = tsk->maj_flt = 0;
1477 tsk->nvcsw = tsk->nivcsw = 0;
1478#ifdef CONFIG_DETECT_HUNG_TASK
1479 tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
1480 tsk->last_switch_time = 0;
1481#endif
1482
1483 tsk->mm = NULL;
1484 tsk->active_mm = NULL;
1485
	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal an active VM for that..
	 */
1491 oldmm = current->mm;
1492 if (!oldmm)
1493 return 0;
1494
	/* initialize the new vmacache entries */
	vmacache_flush(tsk);
1497
1498 if (clone_flags & CLONE_VM) {
1499 mmget(oldmm);
1500 mm = oldmm;
1501 } else {
1502 mm = dup_mm(tsk, current->mm);
1503 if (!mm)
1504 return -ENOMEM;
1505 }
1506
1507 tsk->mm = mm;
1508 tsk->active_mm = mm;
1509 return 0;
1510}
1511
1512static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
1513{
1514 struct fs_struct *fs = current->fs;
1515 if (clone_flags & CLONE_FS) {
		/* tsk->fs is already what we want */
		spin_lock(&fs->lock);
1518 if (fs->in_exec) {
1519 spin_unlock(&fs->lock);
1520 return -EAGAIN;
1521 }
1522 fs->users++;
1523 spin_unlock(&fs->lock);
1524 return 0;
1525 }
1526 tsk->fs = copy_fs_struct(fs);
1527 if (!tsk->fs)
1528 return -ENOMEM;
1529 return 0;
1530}
1531
1532static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
1533{
1534 struct files_struct *oldf, *newf;
1535 int error = 0;
1536
	/*
	 * A background process may not have any files ...
	 */
1540 oldf = current->files;
1541 if (!oldf)
1542 goto out;
1543
1544 if (clone_flags & CLONE_FILES) {
1545 atomic_inc(&oldf->count);
1546 goto out;
1547 }
1548
1549 newf = dup_fd(oldf, NR_OPEN_MAX, &error);
1550 if (!newf)
1551 goto out;
1552
1553 tsk->files = newf;
1554 error = 0;
1555out:
1556 return error;
1557}
1558
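/* Share or copy the parent's I/O context, depending on CLONE_IO. */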
1559static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
1560{
1561#ifdef CONFIG_BLOCK
1562 struct io_context *ioc = current->io_context;
1563 struct io_context *new_ioc;
1564
1565 if (!ioc)
1566 return 0;
	/*
	 * Share io context with parent, if CLONE_IO is set
	 */
1570 if (clone_flags & CLONE_IO) {
1571 ioc_task_link(ioc);
1572 tsk->io_context = ioc;
1573 } else if (ioprio_valid(ioc->ioprio)) {
1574 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
1575 if (unlikely(!new_ioc))
1576 return -ENOMEM;
1577
1578 new_ioc->ioprio = ioc->ioprio;
1579 put_io_context(new_ioc);
1580 }
1581#endif
1582 return 0;
1583}
1584
1585static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1586{
1587 struct sighand_struct *sig;
1588
1589 if (clone_flags & CLONE_SIGHAND) {
		refcount_inc(&current->sighand->count);
1591 return 0;
1592 }
1593 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1594 RCU_INIT_POINTER(tsk->sighand, sig);
1595 if (!sig)
1596 return -ENOMEM;
1597
1598 refcount_set(&sig->count, 1);
	spin_lock_irq(&current->sighand->siglock);
	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
	spin_unlock_irq(&current->sighand->siglock);

	/* Reset all signal handlers not set to SIG_IGN to SIG_DFL. */
	if (clone_flags & CLONE_CLEAR_SIGHAND)
		flush_signal_handlers(tsk, 0);
1606
1607 return 0;
1608}
1609
1610void __cleanup_sighand(struct sighand_struct *sighand)
1611{
1612 if (refcount_dec_and_test(&sighand->count)) {
1613 signalfd_cleanup(sighand);
1614
1615
1616
1617
1618 kmem_cache_free(sighand_cachep, sighand);
1619 }
1620}
1621
/*
 * Initialize POSIX timer handling for a thread group.
 */
1625static void posix_cpu_timers_init_group(struct signal_struct *sig)
1626{
1627 struct posix_cputimers *pct = &sig->posix_cputimers;
1628 unsigned long cpu_limit;
1629
1630 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1631 posix_cputimers_group_init(pct, cpu_limit);
1632}
1633
1634static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1635{
1636 struct signal_struct *sig;
1637
1638 if (clone_flags & CLONE_THREAD)
1639 return 0;
1640
1641 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
1642 tsk->signal = sig;
1643 if (!sig)
1644 return -ENOMEM;
1645
1646 sig->nr_threads = 1;
1647 atomic_set(&sig->live, 1);
1648 refcount_set(&sig->sigcnt, 1);
1649
	/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
	sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
	tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1653
1654 init_waitqueue_head(&sig->wait_chldexit);
1655 sig->curr_target = tsk;
1656 init_sigpending(&sig->shared_pending);
1657 INIT_HLIST_HEAD(&sig->multiprocess);
1658 seqlock_init(&sig->stats_lock);
1659 prev_cputime_init(&sig->prev_cputime);
1660
1661#ifdef CONFIG_POSIX_TIMERS
1662 INIT_LIST_HEAD(&sig->posix_timers);
1663 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1664 sig->real_timer.function = it_real_fn;
1665#endif
1666
1667 task_lock(current->group_leader);
1668 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
1669 task_unlock(current->group_leader);
1670
1671 posix_cpu_timers_init_group(sig);
1672
1673 tty_audit_fork(sig);
1674 sched_autogroup_fork(sig);
1675
1676 sig->oom_score_adj = current->signal->oom_score_adj;
1677 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1678
1679 mutex_init(&sig->cred_guard_mutex);
1680 init_rwsem(&sig->exec_update_lock);
1681
1682 return 0;
1683}
1684
1685static void copy_seccomp(struct task_struct *p)
1686{
1687#ifdef CONFIG_SECCOMP
	/*
	 * Must be called with sighand->lock held, which is common to
	 * all threads in the group. Holding cred_guard_mutex is not
	 * needed because this new task is not yet running and cannot
	 * be racing exec.
	 */
	assert_spin_locked(&current->sighand->siglock);
1695
	/* Ref-count the new filter user, and assign it. */
	get_seccomp_filter(current);
	p->seccomp = current->seccomp;
1699
	/*
	 * Explicitly enable no_new_privs here in case it got set
	 * between the task_struct being duplicated and holding the
	 * sighand lock. The seccomp state and nnp must be in sync.
	 */
1705 if (task_no_new_privs(current))
1706 task_set_no_new_privs(p);
1707
	/*
	 * If the parent gained a seccomp mode after copying thread
	 * flags but before we held the sighand lock, we have to
	 * manually enable the seccomp thread flag here.
	 */
1713 if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
1714 set_task_syscall_work(p, SECCOMP);
1715#endif
1716}
1717
1718SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
1719{
1720 current->clear_child_tid = tidptr;
1721
1722 return task_pid_vnr(current);
1723}
1724
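/* Initialize the task's rtmutex priority-inheritance state. */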
1725static void rt_mutex_init_task(struct task_struct *p)
1726{
1727 raw_spin_lock_init(&p->pi_lock);
1728#ifdef CONFIG_RT_MUTEXES
1729 p->pi_waiters = RB_ROOT_CACHED;
1730 p->pi_top_task = NULL;
1731 p->pi_blocked_on = NULL;
1732#endif
1733}
1734
1735static inline void init_task_pid_links(struct task_struct *task)
1736{
1737 enum pid_type type;
1738
1739 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
1740 INIT_HLIST_NODE(&task->pid_links[type]);
1741}
1742
1743static inline void
1744init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
1745{
1746 if (type == PIDTYPE_PID)
1747 task->thread_pid = pid;
1748 else
1749 task->signal->pids[type] = pid;
1750}
1751
1752static inline void rcu_copy_process(struct task_struct *p)
1753{
1754#ifdef CONFIG_PREEMPT_RCU
1755 p->rcu_read_lock_nesting = 0;
1756 p->rcu_read_unlock_special.s = 0;
1757 p->rcu_blocked_node = NULL;
1758 INIT_LIST_HEAD(&p->rcu_node_entry);
1759#endif
1760#ifdef CONFIG_TASKS_RCU
1761 p->rcu_tasks_holdout = false;
1762 INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
1763 p->rcu_tasks_idle_cpu = -1;
1764#endif
1765#ifdef CONFIG_TASKS_TRACE_RCU
1766 p->trc_reader_nesting = 0;
1767 p->trc_reader_special.s = 0;
1768 INIT_LIST_HEAD(&p->trc_holdout_list);
1769#endif
1770}
1771
1772struct pid *pidfd_pid(const struct file *file)
1773{
1774 if (file->f_op == &pidfd_fops)
1775 return file->private_data;
1776
1777 return ERR_PTR(-EBADF);
1778}
1779
1780static int pidfd_release(struct inode *inode, struct file *file)
1781{
1782 struct pid *pid = file->private_data;
1783
1784 file->private_data = NULL;
1785 put_pid(pid);
1786 return 0;
1787}
1788
1789#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance. If the pid namespace of the
 * process is not a descendant of the pid namespace of the procfs
 * instance 0 will be shown as its pid.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid of a given pidfd refers to for all descendant pid namespaces
 * starting from the current pid namespace of the instance, i.e. the
 * Pid field and the first entry in the NSpid field will be identical.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * entry and no others will be shown.
 */
1825static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1826{
1827 struct pid *pid = f->private_data;
1828 struct pid_namespace *ns;
1829 pid_t nr = -1;
1830
1831 if (likely(pid_has_task(pid, PIDTYPE_PID))) {
1832 ns = proc_pid_ns(file_inode(m->file)->i_sb);
1833 nr = pid_nr_ns(pid, ns);
1834 }
1835
1836 seq_put_decimal_ll(m, "Pid:\t", nr);
1837
1838#ifdef CONFIG_PID_NS
1839 seq_put_decimal_ll(m, "\nNSpid:\t", nr);
1840 if (nr > 0) {
1841 int i;
1842
		/* If nr is non-zero it means that 'pid' is valid and that
		 * ns, i.e. the pid namespace associated with the procfs
		 * instance, is in the pid namespace hierarchy of pid.
		 * Start at one below the already printed level.
		 */
1848 for (i = ns->level + 1; i <= pid->level; i++)
1849 seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
1850 }
1851#endif
1852 seq_putc(m, '\n');
1853}
1854#endif
1855
/*
 * Poll support for process exit notification.
 */
1859static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
1860{
1861 struct pid *pid = file->private_data;
1862 __poll_t poll_flags = 0;
1863
1864 poll_wait(file, &pid->wait_pidfd, pts);
1865
	/*
	 * Inform pollers only when the whole thread group exits.
	 * If the thread group leader exits before all other threads in the
	 * group, then poll(2) should block, similar to the wait(2) family.
	 */
1871 if (thread_group_exited(pid))
1872 poll_flags = EPOLLIN | EPOLLRDNORM;
1873
1874 return poll_flags;
1875}
1876
1877const struct file_operations pidfd_fops = {
1878 .release = pidfd_release,
1879 .poll = pidfd_poll,
1880#ifdef CONFIG_PROC_FS
1881 .show_fdinfo = pidfd_show_fdinfo,
1882#endif
1883};
1884
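/* RCU callback used when freeing the task must wait a grace period (memcg). */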
1885static void __delayed_free_task(struct rcu_head *rhp)
1886{
1887 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
1888
1889 free_task(tsk);
1890}
1891
1892static __always_inline void delayed_free_task(struct task_struct *tsk)
1893{
1894 if (IS_ENABLED(CONFIG_MEMCG))
1895 call_rcu(&tsk->rcu, __delayed_free_task);
1896 else
1897 free_task(tsk);
1898}
1899
1900static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
1901{
	/* Skip if kernel thread */
	if (!tsk->mm)
		return;

	/* Skip if spawning a thread or using vfork */
	if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
		return;

	/* We need to synchronize with __set_oom_adj */
	mutex_lock(&oom_adj_mutex);
	set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
	/* Update the values in case they were changed after copy_signal */
1914 tsk->signal->oom_score_adj = current->signal->oom_score_adj;
1915 tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
1916 mutex_unlock(&oom_adj_mutex);
1917}
1918
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
1927static __latent_entropy struct task_struct *copy_process(
1928 struct pid *pid,
1929 int trace,
1930 int node,
1931 struct kernel_clone_args *args)
1932{
1933 int pidfd = -1, retval;
1934 struct task_struct *p;
1935 struct multiprocess_signals delayed;
1936 struct file *pidfile = NULL;
1937 u64 clone_flags = args->flags;
1938 struct nsproxy *nsp = current->nsproxy;
1939
	/*
	 * Don't allow sharing the root directory with processes in a different
	 * namespace
	 */
1944 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1945 return ERR_PTR(-EINVAL);
1946
1947 if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
1948 return ERR_PTR(-EINVAL);
1949
	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
1954 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
1955 return ERR_PTR(-EINVAL);
1956
	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
1962 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
1963 return ERR_PTR(-EINVAL);
1964
	/*
	 * Siblings of global init remain as zombies on exit since they are
	 * not reaped by their parent (swapper). To solve this and to avoid
	 * multi-rooted process trees, prevent global and container-inits
	 * from creating siblings.
	 */
1971 if ((clone_flags & CLONE_PARENT) &&
1972 current->signal->flags & SIGNAL_UNKILLABLE)
1973 return ERR_PTR(-EINVAL);
1974
	/*
	 * If the new process will be in a different pid or user namespace
	 * do not allow it to share a thread group with the forking task.
	 */
1979 if (clone_flags & CLONE_THREAD) {
1980 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1981 (task_active_pid_ns(current) != nsp->pid_ns_for_children))
1982 return ERR_PTR(-EINVAL);
1983 }
1984
	/*
	 * If the new process will be in a different time namespace
	 * do not allow it to share VM or a thread group with the forking task.
	 */
1989 if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
1990 if (nsp->time_ns != nsp->time_ns_for_children)
1991 return ERR_PTR(-EINVAL);
1992 }
1993
1994 if (clone_flags & CLONE_PIDFD) {
		/*
		 * - CLONE_DETACHED is blocked so that we can potentially
		 *   reuse it later for CLONE_PIDFD.
		 * - CLONE_THREAD is blocked until someone really needs it.
		 */
2000 if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
2001 return ERR_PTR(-EINVAL);
2002 }
2003
	/*
	 * Force any signals received before this point to be delivered
	 * before the fork happens.  Collect up signals sent to multiple
	 * processes that happen during the fork and delay them so that
	 * they appear to happen after the fork.
	 */
2010 sigemptyset(&delayed.signal);
2011 INIT_HLIST_NODE(&delayed.node);
2012
	spin_lock_irq(&current->sighand->siglock);
	if (!(clone_flags & CLONE_THREAD))
		hlist_add_head(&delayed.node, &current->signal->multiprocess);
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);
2018 retval = -ERESTARTNOINTR;
2019 if (task_sigpending(current))
2020 goto fork_out;
2021
2022 retval = -ENOMEM;
2023 p = dup_task_struct(current, node);
2024 if (!p)
2025 goto fork_out;
2026 if (args->io_thread) {
		/*
		 * Mark us an IO worker, and block any signal that isn't
		 * fatal or STOP
		 */
2031 p->flags |= PF_IO_WORKER;
2032 siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
2033 }
2034
	/*
	 * This _must_ happen before we call free_task(), i.e. before we jump
	 * to any of the bad_fork_* labels. This is to avoid freeing
	 * p->set_child_tid which is (ab)used as a kthread's data pointer for
	 * kernel threads (see kthread_bind).
	 */
2041 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
2045 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
2046
2047 ftrace_graph_init_task(p);
2048
2049 rt_mutex_init_task(p);
2050
2051 lockdep_assert_irqs_enabled();
2052#ifdef CONFIG_PROVE_LOCKING
2053 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
2054#endif
2055 retval = -EAGAIN;
2056 if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
2057 if (p->real_cred->user != INIT_USER &&
2058 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
2059 goto bad_fork_free;
2060 }
2061 current->flags &= ~PF_NPROC_EXCEEDED;
2062
2063 retval = copy_creds(p, clone_flags);
2064 if (retval < 0)
2065 goto bad_fork_free;
2066
	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
2072 retval = -EAGAIN;
2073 if (data_race(nr_threads >= max_threads))
2074 goto bad_fork_cleanup_count;
2075
2076 delayacct_tsk_init(p);
2077 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
2078 p->flags |= PF_FORKNOEXEC;
2079 INIT_LIST_HEAD(&p->children);
2080 INIT_LIST_HEAD(&p->sibling);
2081 rcu_copy_process(p);
2082 p->vfork_done = NULL;
2083 spin_lock_init(&p->alloc_lock);
2084
2085 init_sigpending(&p->pending);
2086
2087 p->utime = p->stime = p->gtime = 0;
2088#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
2089 p->utimescaled = p->stimescaled = 0;
2090#endif
2091 prev_cputime_init(&p->prev_cputime);
2092
2093#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
2094 seqcount_init(&p->vtime.seqcount);
2095 p->vtime.starttime = 0;
2096 p->vtime.state = VTIME_INACTIVE;
2097#endif
2098
2099#ifdef CONFIG_IO_URING
2100 p->io_uring = NULL;
2101#endif
2102
2103#if defined(SPLIT_RSS_COUNTING)
2104 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
2105#endif
2106
2107 p->default_timer_slack_ns = current->timer_slack_ns;
2108
2109#ifdef CONFIG_PSI
2110 p->psi_flags = 0;
2111#endif
2112
2113 task_io_accounting_init(&p->ioac);
2114 acct_clear_integrals(p);
2115
2116 posix_cputimers_init(&p->posix_cputimers);
2117
2118 p->io_context = NULL;
2119 audit_set_context(p, NULL);
2120 cgroup_fork(p);
2121#ifdef CONFIG_NUMA
2122 p->mempolicy = mpol_dup(p->mempolicy);
2123 if (IS_ERR(p->mempolicy)) {
2124 retval = PTR_ERR(p->mempolicy);
2125 p->mempolicy = NULL;
2126 goto bad_fork_cleanup_threadgroup_lock;
2127 }
2128#endif
2129#ifdef CONFIG_CPUSETS
2130 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
2131 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
2132 seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
2133#endif
2134#ifdef CONFIG_TRACE_IRQFLAGS
2135 memset(&p->irqtrace, 0, sizeof(p->irqtrace));
2136 p->irqtrace.hardirq_disable_ip = _THIS_IP_;
2137 p->irqtrace.softirq_enable_ip = _THIS_IP_;
2138 p->softirqs_enabled = 1;
2139 p->softirq_context = 0;
2140#endif
2141
2142 p->pagefault_disabled = 0;
2143
2144#ifdef CONFIG_LOCKDEP
2145 lockdep_init_task(p);
2146#endif
2147
2148#ifdef CONFIG_DEBUG_MUTEXES
2149 p->blocked_on = NULL;
2150#endif
2151#ifdef CONFIG_BCACHE
2152 p->sequential_io = 0;
2153 p->sequential_io_avg = 0;
2154#endif
2155#ifdef CONFIG_BPF_SYSCALL
2156 RCU_INIT_POINTER(p->bpf_storage, NULL);
2157 p->bpf_ctx = NULL;
2158#endif
2159
	/* Perform scheduler related setup. Assign this task to a CPU. */
	retval = sched_fork(clone_flags, p);
2162 if (retval)
2163 goto bad_fork_cleanup_policy;
2164
2165 retval = perf_event_init_task(p, clone_flags);
2166 if (retval)
2167 goto bad_fork_cleanup_policy;
2168 retval = audit_alloc(p);
2169 if (retval)
2170 goto bad_fork_cleanup_perf;
2171
2172 shm_init_task(p);
2173 retval = security_task_alloc(p, clone_flags);
2174 if (retval)
2175 goto bad_fork_cleanup_audit;
2176 retval = copy_semundo(clone_flags, p);
2177 if (retval)
2178 goto bad_fork_cleanup_security;
2179 retval = copy_files(clone_flags, p);
2180 if (retval)
2181 goto bad_fork_cleanup_semundo;
2182 retval = copy_fs(clone_flags, p);
2183 if (retval)
2184 goto bad_fork_cleanup_files;
2185 retval = copy_sighand(clone_flags, p);
2186 if (retval)
2187 goto bad_fork_cleanup_fs;
2188 retval = copy_signal(clone_flags, p);
2189 if (retval)
2190 goto bad_fork_cleanup_sighand;
2191 retval = copy_mm(clone_flags, p);
2192 if (retval)
2193 goto bad_fork_cleanup_signal;
2194 retval = copy_namespaces(clone_flags, p);
2195 if (retval)
2196 goto bad_fork_cleanup_mm;
2197 retval = copy_io(clone_flags, p);
2198 if (retval)
2199 goto bad_fork_cleanup_namespaces;
2200 retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
2201 if (retval)
2202 goto bad_fork_cleanup_io;
2203
2204 stackleak_task_init(p);
2205
2206 if (pid != &init_struct_pid) {
2207 pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
2208 args->set_tid_size);
2209 if (IS_ERR(pid)) {
2210 retval = PTR_ERR(pid);
2211 goto bad_fork_cleanup_thread;
2212 }
2213 }
2214
2215
	/*
	 * This has to happen after we've potentially unshared the file
	 * descriptor table (so that the pidfd doesn't leak).
	 */
2220 if (clone_flags & CLONE_PIDFD) {
2221 retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
2222 if (retval < 0)
2223 goto bad_fork_free_pid;
2224
2225 pidfd = retval;
2226
2227 pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
2228 O_RDWR | O_CLOEXEC);
2229 if (IS_ERR(pidfile)) {
2230 put_unused_fd(pidfd);
2231 retval = PTR_ERR(pidfile);
2232 goto bad_fork_free_pid;
2233 }
2234 get_pid(pid);
2235
2236 retval = put_user(pidfd, args->pidfd);
2237 if (retval)
2238 goto bad_fork_put_pidfd;
2239 }
2240
2241#ifdef CONFIG_BLOCK
2242 p->plug = NULL;
2243#endif
2244 futex_init_task(p);
2245
	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */
2249 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
2250 sas_ss_reset(p);
2251
	/*
	 * Syscall tracing and stepping should be turned off in the
	 * child regardless of CLONE_PTRACE.
	 */
2256 user_disable_single_step(p);
2257 clear_task_syscall_work(p, SYSCALL_TRACE);
2258#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
2259 clear_task_syscall_work(p, SYSCALL_EMU);
2260#endif
2261 clear_tsk_latency_tracing(p);
2262
	/* ok, now we should be set up.. */
	p->pid = pid_nr(pid);
2265 if (clone_flags & CLONE_THREAD) {
2266 p->group_leader = current->group_leader;
2267 p->tgid = current->tgid;
2268 } else {
2269 p->group_leader = p;
2270 p->tgid = p->pid;
2271 }
2272
2273 p->nr_dirtied = 0;
2274 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
2275 p->dirty_paused_when = 0;
2276
2277 p->pdeath_signal = 0;
2278 INIT_LIST_HEAD(&p->thread_group);
2279 p->task_works = NULL;
2280 clear_posix_cputimers_work(p);
2281
2282#ifdef CONFIG_KRETPROBES
2283 p->kretprobe_instances.first = NULL;
2284#endif
2285
	/*
	 * Ensure that the cgroup subsystem policies allow the new process to be
	 * forked. It should be noted that the new process's css_set can be changed
	 * between here and cgroup_post_fork() if an organisation operation is in
	 * progress.
	 */
2292 retval = cgroup_can_fork(p, args);
2293 if (retval)
2294 goto bad_fork_put_pidfd;
2295
2296
	/*
	 * From this point on we must avoid any synchronous user-space
	 * communication until we take the tasklist-lock. In particular, we do
	 * not want user-space to be able to predict the process start-time by
	 * stalling fork(2) after we recorded the start_time but before it is
	 * visible to the system.
	 */
2304 p->start_time = ktime_get_ns();
2305 p->start_boottime = ktime_get_boottime_ns();
2306
	/*
	 * Make it visible to the rest of the system, but dont wake it up yet.
	 * Need tasklist lock for parent etc handling!
	 */
2311 write_lock_irq(&tasklist_lock);
2312
	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
2315 p->real_parent = current->real_parent;
2316 p->parent_exec_id = current->parent_exec_id;
2317 if (clone_flags & CLONE_THREAD)
2318 p->exit_signal = -1;
2319 else
2320 p->exit_signal = current->group_leader->exit_signal;
2321 } else {
2322 p->real_parent = current;
2323 p->parent_exec_id = current->self_exec_id;
2324 p->exit_signal = args->exit_signal;
2325 }
2326
2327 klp_copy_process(p);
2328
2329 sched_core_fork(p);
2330
	spin_lock(&current->sighand->siglock);

	/*
	 * Copy seccomp details explicitly here, in case they were changed
	 * before holding sighand lock.
	 */
	copy_seccomp(p);
2338
2339 rseq_fork(p, clone_flags);
2340
	/* Don't start children in a dying pid namespace */
	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
2343 retval = -ENOMEM;
2344 goto bad_fork_cancel_cgroup;
2345 }
2346
	/* Let kill terminate clone/fork in the middle */
	if (fatal_signal_pending(current)) {
2349 retval = -EINTR;
2350 goto bad_fork_cancel_cgroup;
2351 }
2352
	/* past the last point of failure */
	if (pidfile)
2355 fd_install(pidfd, pidfile);
2356
2357 init_task_pid_links(p);
2358 if (likely(p->pid)) {
2359 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
2360
2361 init_task_pid(p, PIDTYPE_PID, pid);
2362 if (thread_group_leader(p)) {
2363 init_task_pid(p, PIDTYPE_TGID, pid);
2364 init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
2365 init_task_pid(p, PIDTYPE_SID, task_session(current));
2366
2367 if (is_child_reaper(pid)) {
2368 ns_of_pid(pid)->child_reaper = p;
2369 p->signal->flags |= SIGNAL_UNKILLABLE;
2370 }
2371 p->signal->shared_pending.signal = delayed.signal;
2372 p->signal->tty = tty_kref_get(current->signal->tty);
			/*
			 * Inherit has_child_subreaper flag under the same
			 * tasklist_lock with adding child to the process tree
			 * for propagate_has_child_subreaper optimization.
			 */
2378 p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
2379 p->real_parent->signal->is_child_subreaper;
2380 list_add_tail(&p->sibling, &p->real_parent->children);
2381 list_add_tail_rcu(&p->tasks, &init_task.tasks);
2382 attach_pid(p, PIDTYPE_TGID);
2383 attach_pid(p, PIDTYPE_PGID);
2384 attach_pid(p, PIDTYPE_SID);
2385 __this_cpu_inc(process_counts);
2386 } else {
2387 current->signal->nr_threads++;
			atomic_inc(&current->signal->live);
			refcount_inc(&current->signal->sigcnt);
2390 task_join_group_stop(p);
2391 list_add_tail_rcu(&p->thread_group,
2392 &p->group_leader->thread_group);
2393 list_add_tail_rcu(&p->thread_node,
2394 &p->signal->thread_head);
2395 }
2396 attach_pid(p, PIDTYPE_PID);
2397 nr_threads++;
2398 }
2399 total_forks++;
2400 hlist_del_init(&delayed.node);
	spin_unlock(&current->sighand->siglock);
2402 syscall_tracepoint_update(p);
2403 write_unlock_irq(&tasklist_lock);
2404
2405 proc_fork_connector(p);
2406 sched_post_fork(p, args);
2407 cgroup_post_fork(p, args);
2408 perf_event_fork(p);
2409
2410 trace_task_newtask(p, clone_flags);
2411 uprobe_copy_process(p, clone_flags);
2412
2413 copy_oom_score_adj(clone_flags, p);
2414
2415 return p;
2416
2417bad_fork_cancel_cgroup:
2418 sched_core_free(p);
	spin_unlock(&current->sighand->siglock);
2420 write_unlock_irq(&tasklist_lock);
2421 cgroup_cancel_fork(p, args);
2422bad_fork_put_pidfd:
2423 if (clone_flags & CLONE_PIDFD) {
2424 fput(pidfile);
2425 put_unused_fd(pidfd);
2426 }
2427bad_fork_free_pid:
2428 if (pid != &init_struct_pid)
2429 free_pid(pid);
2430bad_fork_cleanup_thread:
2431 exit_thread(p);
2432bad_fork_cleanup_io:
2433 if (p->io_context)
2434 exit_io_context(p);
2435bad_fork_cleanup_namespaces:
2436 exit_task_namespaces(p);
2437bad_fork_cleanup_mm:
2438 if (p->mm) {
2439 mm_clear_owner(p->mm, p);
2440 mmput(p->mm);
2441 }
2442bad_fork_cleanup_signal:
2443 if (!(clone_flags & CLONE_THREAD))
2444 free_signal_struct(p->signal);
2445bad_fork_cleanup_sighand:
2446 __cleanup_sighand(p->sighand);
2447bad_fork_cleanup_fs:
2448 exit_fs(p);
2449bad_fork_cleanup_files:
2450 exit_files(p);
2451bad_fork_cleanup_semundo:
2452 exit_sem(p);
2453bad_fork_cleanup_security:
2454 security_task_free(p);
2455bad_fork_cleanup_audit:
2456 audit_free(p);
2457bad_fork_cleanup_perf:
2458 perf_event_free_task(p);
2459bad_fork_cleanup_policy:
2460 lockdep_free_task(p);
2461#ifdef CONFIG_NUMA
2462 mpol_put(p->mempolicy);
2463bad_fork_cleanup_threadgroup_lock:
2464#endif
2465 delayacct_tsk_free(p);
2466bad_fork_cleanup_count:
2467 dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
2468 exit_creds(p);
2469bad_fork_free:
2470 WRITE_ONCE(p->__state, TASK_DEAD);
2471 put_task_stack(p);
2472 delayed_free_task(p);
2473fork_out:
	spin_lock_irq(&current->sighand->siglock);
	hlist_del_init(&delayed.node);
	spin_unlock_irq(&current->sighand->siglock);
2477 return ERR_PTR(retval);
2478}
2479
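/* Point all of the idle task's pid links at init_struct_pid. */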
2480static inline void init_idle_pids(struct task_struct *idle)
2481{
2482 enum pid_type type;
2483
2484 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
2485 INIT_HLIST_NODE(&idle->pid_links[type]);
2486 init_task_pid(idle, type, &init_struct_pid);
2487 }
2488}
2489
2490struct task_struct * __init fork_idle(int cpu)
2491{
2492 struct task_struct *task;
2493 struct kernel_clone_args args = {
2494 .flags = CLONE_VM,
2495 };
2496
2497 task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2498 if (!IS_ERR(task)) {
2499 init_idle_pids(task);
2500 init_idle(task, cpu);
2501 }
2502
2503 return task;
2504}
2505
2506struct mm_struct *copy_init_mm(void)
2507{
2508 return dup_mm(NULL, &init_mm);
2509}
2510
/*
 * This is like kernel_clone(), but shaved down and tailored to just
 * creating io_uring workers. It returns a created task, or an error pointer.
 * The returned task is inactive, and the caller must fire it up through
 * wake_up_new_task(p). All signals are blocked in the created task.
 */
2517struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
2518{
2519 unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
2520 CLONE_IO;
2521 struct kernel_clone_args args = {
2522 .flags = ((lower_32_bits(flags) | CLONE_VM |
2523 CLONE_UNTRACED) & ~CSIGNAL),
2524 .exit_signal = (lower_32_bits(flags) & CSIGNAL),
2525 .stack = (unsigned long)fn,
2526 .stack_size = (unsigned long)arg,
2527 .io_thread = 1,
2528 };
2529
2530 return copy_process(NULL, 0, node, &args);
2531}
2532
/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 */
2541pid_t kernel_clone(struct kernel_clone_args *args)
2542{
2543 u64 clone_flags = args->flags;
2544 struct completion vfork;
2545 struct pid *pid;
2546 struct task_struct *p;
2547 int trace = 0;
2548 pid_t nr;
2549
	/*
	 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
	 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
	 * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
	 * field in struct clone_args and it still doesn't make sense to have
	 * them both point at the same memory location. Performing this check
	 * here has the advantage that we don't need to have a separate helper
	 * to check for legacy clone().
	 */
2559 if ((args->flags & CLONE_PIDFD) &&
2560 (args->flags & CLONE_PARENT_SETTID) &&
2561 (args->pidfd == args->parent_tid))
2562 return -EINVAL;
2563
	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
2570 if (!(clone_flags & CLONE_UNTRACED)) {
2571 if (clone_flags & CLONE_VFORK)
2572 trace = PTRACE_EVENT_VFORK;
2573 else if (args->exit_signal != SIGCHLD)
2574 trace = PTRACE_EVENT_CLONE;
2575 else
2576 trace = PTRACE_EVENT_FORK;
2577
2578 if (likely(!ptrace_event_enabled(current, trace)))
2579 trace = 0;
2580 }
2581
2582 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2583 add_latent_entropy();
2584
2585 if (IS_ERR(p))
2586 return PTR_ERR(p);
2587
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
2592 trace_sched_process_fork(current, p);
2593
2594 pid = get_task_pid(p, PIDTYPE_PID);
2595 nr = pid_vnr(pid);
2596
2597 if (clone_flags & CLONE_PARENT_SETTID)
2598 put_user(nr, args->parent_tid);
2599
2600 if (clone_flags & CLONE_VFORK) {
2601 p->vfork_done = &vfork;
2602 init_completion(&vfork);
2603 get_task_struct(p);
2604 }
2605
2606 wake_up_new_task(p);
2607
2608
2609 if (unlikely(trace))
2610 ptrace_event_pid(trace, pid);
2611
2612 if (clone_flags & CLONE_VFORK) {
2613 if (!wait_for_vfork_done(p, &vfork))
2614 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
2615 }
2616
2617 put_pid(pid);
2618 return nr;
2619}

/*
 * Create a kernel thread.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	struct kernel_clone_args args = {
		.flags = ((lower_32_bits(flags) | CLONE_VM |
			   CLONE_UNTRACED) & ~CSIGNAL),
		.exit_signal = (lower_32_bits(flags) & CSIGNAL),
		.stack = (unsigned long)fn,
		.stack_size = (unsigned long)arg,
	};

	return kernel_clone(&args);
}
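/*
 * Usage sketch (illustrative; worker_fn and data are hypothetical): most
 * kernel code should use the kthread_create()/kthread_run() API instead,
 * but a raw kernel thread is started as:
 *
 *	pid_t pid = kernel_thread(worker_fn, data, CLONE_FS | CLONE_FILES);
 *
 * The new thread executes worker_fn(data) and exits when that function
 * returns or calls do_exit().
 */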

#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
	struct kernel_clone_args args = {
		.exit_signal = SIGCHLD,
	};

	return kernel_clone(&args);
#else
	/* can not support in nommu mode */
	return -EINVAL;
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
	struct kernel_clone_args args = {
		.flags = CLONE_VFORK | CLONE_VM,
		.exit_signal = SIGCHLD,
	};

	return kernel_clone(&args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 unsigned long, tls,
		 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
		int, stack_size,
		int __user *, parent_tidptr,
		int __user *, child_tidptr,
		unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 unsigned long, tls)
#endif
{
	struct kernel_clone_args args = {
		.flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
		.pidfd = parent_tidptr, /* CLONE_PIDFD shares the parent_tid slot */
		.child_tid = child_tidptr,
		.parent_tid = parent_tidptr,
		.exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
		.stack = newsp,
		.tls = tls,
	};

	return kernel_clone(&args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE3

noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
					      struct clone_args __user *uargs,
					      size_t usize)
{
	int err;
	struct clone_args args;
	pid_t *kset_tid = kargs->set_tid;

	BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
		     CLONE_ARGS_SIZE_VER0);
	BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
		     CLONE_ARGS_SIZE_VER1);
	BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
		     CLONE_ARGS_SIZE_VER2);
	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);

	if (unlikely(usize > PAGE_SIZE))
		return -E2BIG;
	if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
		return -EINVAL;

	err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
	if (err)
		return err;

	if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
		return -EINVAL;

	if (unlikely(!args.set_tid && args.set_tid_size > 0))
		return -EINVAL;

	if (unlikely(args.set_tid && args.set_tid_size == 0))
		return -EINVAL;

	/*
	 * Verify that higher 32bits of exit_signal are unset and that
	 * it is a valid signal
	 */
	if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
		     !valid_signal(args.exit_signal)))
		return -EINVAL;

	if ((args.flags & CLONE_INTO_CGROUP) &&
	    (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
		return -EINVAL;

	*kargs = (struct kernel_clone_args){
		.flags = args.flags,
		.pidfd = u64_to_user_ptr(args.pidfd),
		.child_tid = u64_to_user_ptr(args.child_tid),
		.parent_tid = u64_to_user_ptr(args.parent_tid),
		.exit_signal = args.exit_signal,
		.stack = args.stack,
		.stack_size = args.stack_size,
		.tls = args.tls,
		.set_tid_size = args.set_tid_size,
		.cgroup = args.cgroup,
	};

	if (args.set_tid &&
		copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
			(kargs->set_tid_size * sizeof(pid_t))))
		return -EFAULT;

	kargs->set_tid = kset_tid;

	return 0;
}
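/*
 * Worked example (illustrative): a binary built against the VER0 layout
 * passes usize == CLONE_ARGS_SIZE_VER0. copy_struct_from_user() zero-fills
 * the tail of the kernel-side struct, so set_tid, set_tid_size and cgroup
 * all read as zero and the newer validation above passes trivially. For a
 * usize larger than the kernel's struct clone_args, the call only succeeds
 * if every trailing byte userspace supplied is zero; otherwise it fails
 * with -E2BIG.
 */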

/**
 * clone3_stack_valid - check and prepare stack
 * @kargs: kernel clone args
 *
 * Verify that the stack arguments userspace gave us are sane.
 * In addition, set the stack direction for userspace since it's easy for us to
 * determine.
 */
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
{
	if (kargs->stack == 0) {
		if (kargs->stack_size > 0)
			return false;
	} else {
		if (kargs->stack_size == 0)
			return false;

		if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
			return false;

#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
		kargs->stack += kargs->stack_size;
#endif
	}

	return true;
}
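/*
 * Worked example (illustrative): on a stack-grows-down architecture,
 * userspace maps a stack and passes its *lowest* address plus the size,
 * e.g. .stack = (__u64)addr and .stack_size = 8 * 1024 * 1024 for an
 * 8 MiB mapping. The adjustment above turns kargs->stack into
 * addr + stack_size, i.e. the child's initial stack pointer, so callers
 * do not have to do that arithmetic themselves (unlike with legacy
 * clone()).
 */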

static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
	/* Verify that no unknown flags are passed along. */
	if (kargs->flags &
	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
		return false;

	/*
	 * - make the CLONE_DETACHED bit reusable for clone3
	 * - make the CSIGNAL bits reusable for clone3
	 */
	if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
		return false;

	if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
	    (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
		return false;

	if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
	    kargs->exit_signal)
		return false;

	if (!clone3_stack_valid(kargs))
		return false;

	return true;
}

/**
 * clone3 - create a new process with specific properties
 * @uargs: argument structure
 * @size: size of @uargs
 *
 * clone3() is the extensible successor to clone()/clone2().
 * It takes a struct as argument that is versioned by size.
 *
 * Return: On success, a positive PID for the child process.
 *         On error, a negative errno number.
 */
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
{
	int err;

	struct kernel_clone_args kargs;
	pid_t set_tid[MAX_PID_NS_LEVEL];

	kargs.set_tid = set_tid;

	err = copy_clone_args_from_user(&kargs, uargs, size);
	if (err)
		return err;

	if (!clone3_args_valid(&kargs))
		return -EINVAL;

	return kernel_clone(&kargs);
}
#endif
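/*
 * Userspace usage sketch (illustrative; error handling omitted): clone3 is
 * usually reached via syscall(2) since libc wrappers are not guaranteed:
 *
 *	int pidfd = -1;
 *	struct clone_args args = {
 *		.flags       = CLONE_PIDFD,
 *		.pidfd       = (__u64)(uintptr_t)&pidfd,
 *		.exit_signal = SIGCHLD,
 *	};
 *	pid_t pid = syscall(__NR_clone3, &args, sizeof(args));
 *
 * pid is 0 in the child, the child's PID in the parent, and -1 on error
 * (with errno set).
 */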

void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
	struct task_struct *leader, *parent, *child;
	int res;

	read_lock(&tasklist_lock);
	leader = top = top->group_leader;
down:
	for_each_thread(leader, parent) {
		list_for_each_entry(child, &parent->children, sibling) {
			res = visitor(child, data);
			if (res) {
				if (res < 0)
					goto out;
				leader = child;
				goto down;
			}
up:
			;
		}
	}

	if (leader != top) {
		child = leader;
		parent = child->real_parent;
		leader = parent->group_leader;
		goto up;
	}
out:
	read_unlock(&tasklist_lock);
}
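/*
 * Visitor sketch (illustrative; mark_task is hypothetical): a visitor
 * returns 0 to keep iterating siblings, a positive value to descend into
 * the children of the task just visited, and a negative value to abort
 * the walk:
 *
 *	static int mark_task(struct task_struct *p, void *data)
 *	{
 *		...			per-task work on p goes here
 *		return 1;		descend into p's children too
 *	}
 *
 *	walk_process_tree(current, mark_task, NULL);
 */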

#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif

static void sighand_ctor(void *data)
{
	struct sighand_struct *sighand = data;

	spin_lock_init(&sighand->siglock);
	init_waitqueue_head(&sighand->signalfd_wqh);
}

void __init proc_caches_init(void)
{
	unsigned int mm_size;

	sighand_cachep = kmem_cache_create("sighand_cache",
			sizeof(struct sighand_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
			SLAB_ACCOUNT, sighand_ctor);
	signal_cachep = kmem_cache_create("signal_cache",
			sizeof(struct signal_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			NULL);
	files_cachep = kmem_cache_create("files_cache",
			sizeof(struct files_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			NULL);
	fs_cachep = kmem_cache_create("fs_cache",
			sizeof(struct fs_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			NULL);

	/*
	 * The mm_cpumask is located at the end of mm_struct, and is
	 * dynamically sized based on the maximum CPU number this system
	 * can have, taking hotplug into account (nr_cpu_ids).
	 */
	mm_size = sizeof(struct mm_struct) + cpumask_size();

	mm_cachep = kmem_cache_create_usercopy("mm_struct",
			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			offsetof(struct mm_struct, saved_auxv),
			sizeof_field(struct mm_struct, saved_auxv),
			NULL);
	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
	mmap_init();
	nsproxy_cache_init();
}

/*
 * Check constraints on flags passed to the unshare system call.
 */
static int check_unshare_flags(unsigned long unshare_flags)
{
	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
				CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
				CLONE_NEWTIME))
		return -EINVAL;
	/*
	 * Not implemented, but pretend it works if there is nothing
	 * to unshare.  Note that unsharing the address space or the
	 * signal handlers also need to unshare the signal queues (aka
	 * CLONE_THREAD).
	 */
	if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
		if (!thread_group_empty(current))
			return -EINVAL;
	}
	if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
		if (refcount_read(&current->sighand->count) > 1)
			return -EINVAL;
	}
	if (unshare_flags & CLONE_VM) {
		if (!current_is_single_threaded())
			return -EINVAL;
	}

	return 0;
}

/*
 * Unshare the filesystem structure if it is being shared
 */
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
	struct fs_struct *fs = current->fs;

	if (!(unshare_flags & CLONE_FS) || !fs)
		return 0;

	/* don't need lock here; in the worst case we'll do useless copy */
	if (fs->users == 1)
		return 0;

	*new_fsp = copy_fs_struct(fs);
	if (!*new_fsp)
		return -ENOMEM;

	return 0;
}

/*
 * Unshare file descriptor table if it is being shared
 */
int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
	       struct files_struct **new_fdp)
{
	struct files_struct *fd = current->files;
	int error = 0;

	if ((unshare_flags & CLONE_FILES) &&
	    (fd && atomic_read(&fd->count) > 1)) {
		*new_fdp = dup_fd(fd, max_fds, &error);
		if (!*new_fdp)
			return error;
	}

	return 0;
}

/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by kernel_clone() cannot be used here directly
 * because they modify an inactive task_struct that is being
 * constructed. Here we are modifying the current, active,
 * task_struct.
 */
int ksys_unshare(unsigned long unshare_flags)
{
	struct fs_struct *fs, *new_fs = NULL;
	struct files_struct *new_fd = NULL;
	struct cred *new_cred = NULL;
	struct nsproxy *new_nsproxy = NULL;
	int do_sysvsem = 0;
	int err;

	/*
	 * If unsharing a user namespace must also unshare the thread group
	 * and unshare the filesystem root and working directories.
	 */
	if (unshare_flags & CLONE_NEWUSER)
		unshare_flags |= CLONE_THREAD | CLONE_FS;
	/*
	 * If unsharing vm, must also unshare signal handlers.
	 */
	if (unshare_flags & CLONE_VM)
		unshare_flags |= CLONE_SIGHAND;
	/*
	 * If unsharing a signal handlers, must also unshare the signal queues.
	 */
	if (unshare_flags & CLONE_SIGHAND)
		unshare_flags |= CLONE_THREAD;
	/*
	 * If unsharing namespace, must also unshare filesystem information.
	 */
	if (unshare_flags & CLONE_NEWNS)
		unshare_flags |= CLONE_FS;

	err = check_unshare_flags(unshare_flags);
	if (err)
		goto bad_unshare_out;
	/*
	 * CLONE_NEWIPC must also detach from the undolist: after switching
	 * to a new ipc namespace, the semaphore arrays from the old
	 * namespace are unreachable.
	 */
	if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
		do_sysvsem = 1;
	err = unshare_fs(unshare_flags, &new_fs);
	if (err)
		goto bad_unshare_out;
	err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
	if (err)
		goto bad_unshare_cleanup_fs;
	err = unshare_userns(unshare_flags, &new_cred);
	if (err)
		goto bad_unshare_cleanup_fd;
	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
					 new_cred, new_fs);
	if (err)
		goto bad_unshare_cleanup_cred;

	if (new_cred) {
		err = set_cred_ucounts(new_cred);
		if (err)
			goto bad_unshare_cleanup_cred;
	}

	if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
		if (do_sysvsem) {
			/*
			 * CLONE_SYSVSEM is equivalent to sys_exit().
			 */
			exit_sem(current);
		}
		if (unshare_flags & CLONE_NEWIPC) {
			/* Orphan segments in old ns (see sem above). */
			exit_shm(current);
			shm_init_task(current);
		}

		if (new_nsproxy)
			switch_task_namespaces(current, new_nsproxy);

		task_lock(current);

		if (new_fs) {
			fs = current->fs;
			spin_lock(&fs->lock);
			current->fs = new_fs;
			if (--fs->users)
				new_fs = NULL;
			else
				new_fs = fs;
			spin_unlock(&fs->lock);
		}

		if (new_fd)
			swap(current->files, new_fd);

		task_unlock(current);

		if (new_cred) {
			/* Install the new user namespace */
			commit_creds(new_cred);
			new_cred = NULL;
		}
	}

	perf_event_namespaces(current);

bad_unshare_cleanup_cred:
	if (new_cred)
		put_cred(new_cred);
bad_unshare_cleanup_fd:
	if (new_fd)
		put_files_struct(new_fd);

bad_unshare_cleanup_fs:
	if (new_fs)
		free_fs_struct(new_fs);

bad_unshare_out:
	return err;
}
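/*
 * Userspace usage sketch (illustrative): a suitably privileged process
 * (CAP_SYS_ADMIN for most namespace types) can move itself into fresh
 * mount and UTS namespaces with:
 *
 *	if (unshare(CLONE_NEWNS | CLONE_NEWUTS) == -1)
 *		perror("unshare");
 *
 * Combinations rejected by check_unshare_flags() above, such as CLONE_VM
 * in a multithreaded process, fail with -EINVAL.
 */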

SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
	return ksys_unshare(unshare_flags);
}

/*
 *	Helper to unshare the files of the current task.
 *	We don't want to expose copy_files internals to
 *	the exec layer of the kernel.
 */
int unshare_files(void)
{
	struct task_struct *task = current;
	struct files_struct *old, *copy = NULL;
	int error;

	error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
	if (error || !copy)
		return error;

	old = task->files;
	task_lock(task);
	task->files = copy;
	task_unlock(task);
	put_files_struct(old);
	return 0;
}
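/*
 * Call-site sketch (illustrative): the exec path unshares the file table
 * before loading a new binary so the new image gets a private fdtable:
 *
 *	retval = unshare_files();
 *	if (retval)
 *		goto out_ret;
 *
 * Returning 0 without making a copy (the table was not shared) is also a
 * success case.
 */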

int sysctl_max_threads(struct ctl_table *table, int write,
		       void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int ret;
	int threads = max_threads;
	int min = 1;
	int max = MAX_THREADS;

	t = *table;
	t.data = &threads;
	t.extra1 = &min;
	t.extra2 = &max;

	ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (ret || !write)
		return ret;

	max_threads = threads;

	return 0;
}

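/*
 * Admin usage sketch (illustrative): this handler backs
 * /proc/sys/kernel/threads-max, so the limit can be adjusted at runtime:
 *
 *	echo 200000 > /proc/sys/kernel/threads-max
 *
 * Writes outside [1, MAX_THREADS] are rejected with -EINVAL by
 * proc_dointvec_minmax().
 */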