// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a real bitch. (i.e. "general-purpose demand-paging")
 */

#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

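/*
 * Minimum number of threads to boot the kernel
 */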
#define MIN_THREADS 20

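/*
 * Maximum number of threads
 */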
#define MAX_THREADS FUTEX_TID_MASK

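/*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */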
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int nr_threads;			/* The idle threads do not count.. */

static int max_threads;		/* tunable limit on nr_threads */

#define NAMED_ARRAY_INDEX(x)	[x] = __stringify(x)

static const char * const resident_page_types[] = {
	NAMED_ARRAY_INDEX(MM_FILEPAGES),
	NAMED_ARRAY_INDEX(MM_ANONPAGES),
	NAMED_ARRAY_INDEX(MM_SWAPENTS),
	NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

#ifdef CONFIG_PROVE_RCU
int lockdep_tasklist_lock_is_held(void)
{
	return lockdep_is_held(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */

int nr_processes(void)
{
	int cpu;
	int total = 0;

	for_each_possible_cpu(cpu)
		total += per_cpu(process_counts, cpu);

	return total;
}

void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;

static inline struct task_struct *alloc_task_struct_node(int node)
{
	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}

static inline void free_task_struct(struct task_struct *tsk)
{
	kmem_cache_free(task_struct_cachep, tsk);
}
#endif

#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

#ifdef CONFIG_VMAP_STACK

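/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush anyway, so cache up to NR_CACHED_STACKS stacks per CPU and reuse
 * them for new threads.
 */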
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

static int free_vm_stack_cache(unsigned int cpu)
{
	struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *vm_stack = cached_vm_stacks[i];

		if (!vm_stack)
			continue;

		vfree(vm_stack->addr);
		cached_vm_stacks[i] = NULL;
	}

	return 0;
}
#endif

static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
#ifdef CONFIG_VMAP_STACK
	void *stack;
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *s;

		s = this_cpu_xchg(cached_stacks[i], NULL);

		if (!s)
			continue;

		/* Mark stack accessible for KASAN. */
		kasan_unpoison_range(s->addr, THREAD_SIZE);

		/* Clear stale pointers from reused stack. */
		memset(s->addr, 0, THREAD_SIZE);

		tsk->stack_vm_area = s;
		tsk->stack = s->addr;
		return s->addr;
	}

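	/*
	 * Allocated stacks are cached and later reused by new threads,
	 * so memcg accounting is performed manually on assigning/releasing
	 * stacks to tasks. Hence do not use __GFP_ACCOUNT.
	 */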
	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
				     VMALLOC_START, VMALLOC_END,
				     THREADINFO_GFP & ~__GFP_ACCOUNT,
				     PAGE_KERNEL,
				     0, node, __builtin_return_address(0));

	/*
	 * We can't call find_vm_area() in interrupt context, and
	 * free_thread_stack() can be called in interrupt context,
	 * so cache the vm_struct.
	 */
	if (stack) {
		tsk->stack_vm_area = find_vm_area(stack);
		tsk->stack = stack;
	}
	return stack;
#else
	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
					     THREAD_SIZE_ORDER);

	if (likely(page)) {
		tsk->stack = kasan_reset_tag(page_address(page));
		return tsk->stack;
	}
	return NULL;
#endif
}

static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
	struct vm_struct *vm = task_stack_vm_area(tsk);

	if (vm) {
		int i;

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
			memcg_kmem_uncharge_page(vm->pages[i], 0);

		for (i = 0; i < NR_CACHED_STACKS; i++) {
			if (this_cpu_cmpxchg(cached_stacks[i],
					NULL, tsk->stack_vm_area) != NULL)
				continue;

			return;
		}

		vfree_atomic(tsk->stack);
		return;
	}
#endif

	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_stack_cache;

static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
						  int node)
{
	unsigned long *stack;
	stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
	stack = kasan_reset_tag(stack);
	tsk->stack = stack;
	return stack;
}

static void free_thread_stack(struct task_struct *tsk)
{
	kmem_cache_free(thread_stack_cache, tsk->stack);
}

void thread_stack_cache_init(void)
{
	thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
					THREAD_SIZE, THREAD_SIZE, 0, 0,
					THREAD_SIZE, NULL);
	BUG_ON(thread_stack_cache == NULL);
}
# endif
#endif

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;

struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
	if (vma)
		vma_init(vma, mm);
	return vma;
}

struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);

	if (new) {
		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
		/*
		 * orig->shared.rb may be modified concurrently, but the clone
		 * will be reinitialized.
		 */
		*new = data_race(*orig);
		INIT_LIST_HEAD(&new->anon_vma_chain);
		new->vm_next = new->vm_prev = NULL;
	}
	return new;
}

void vm_area_free(struct vm_area_struct *vma)
{
	kmem_cache_free(vm_area_cachep, vma);
}

static void account_kernel_stack(struct task_struct *tsk, int account)
{
	void *stack = task_stack_page(tsk);
	struct vm_struct *vm = task_stack_vm_area(tsk);

	if (vm) {
		int i;

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
			mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
					      account * (PAGE_SIZE / 1024));
	} else {
		/* All stack pages are in the same node. */
		mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
				      account * (THREAD_SIZE / 1024));
	}
}

static int memcg_charge_kernel_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
	struct vm_struct *vm = task_stack_vm_area(tsk);
	int ret;

	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);

	if (vm) {
		int i;

		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
			/*
			 * If memcg_kmem_charge_page() fails, page's
			 * memory cgroup pointer is NULL, and
			 * memcg_kmem_uncharge_page() in free_thread_stack()
			 * will ignore this page.
			 */
			ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
						     0);
			if (ret)
				return ret;
		}
	}
#endif
	return 0;
}

static void release_task_stack(struct task_struct *tsk)
{
	if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
		return;  /* Better to leak the stack than to free it prematurely */

	account_kernel_stack(tsk, -1);
	free_thread_stack(tsk);
	tsk->stack = NULL;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = NULL;
#endif
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
void put_task_stack(struct task_struct *tsk)
{
	if (refcount_dec_and_test(&tsk->stack_refcount))
		release_task_stack(tsk);
}
#endif

void free_task(struct task_struct *tsk)
{
	release_user_cpus_ptr(tsk);
	scs_release(tsk);

#ifndef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * The task is finally done with both the stack and thread_info,
	 * so free both.
	 */
	release_task_stack(tsk);
#else
	/*
	 * If the task had a separate stack allocation, it should be gone
	 * by now.
	 */
	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
	rt_mutex_debug_task_free(tsk);
	ftrace_graph_exit_task(tsk);
	arch_release_task_struct(tsk);
	if (tsk->flags & PF_KTHREAD)
		free_kthread_struct(tsk);
	free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
{
	struct file *exe_file;

	exe_file = get_mm_exe_file(oldmm);
	RCU_INIT_POINTER(mm->exe_file, exe_file);
	/*
	 * We depend on the oldmm having properly denied write access to the
	 * exe_file already.
	 */
	if (exe_file && deny_write_access(exe_file))
		pr_warn_once("deny_write_access() failed in %s\n", __func__);
}

#ifdef CONFIG_MMU
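/*
 * Duplicate the parent's address space for fork(): copy the VMA list, the
 * rbtree and the page tables, sharing pages copy-on-write where possible.
 * Takes both mmap locks (old mm first, new mm nested below it).
 */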
static __latent_entropy int dup_mmap(struct mm_struct *mm,
					struct mm_struct *oldmm)
{
	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;
	LIST_HEAD(uf);

	uprobe_start_dup_mmap();
	if (mmap_write_lock_killable(oldmm)) {
		retval = -EINTR;
		goto fail_uprobe_end;
	}
	flush_cache_dup_mm(oldmm);
	uprobe_dup_mmap(oldmm, mm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);

	/* No ordering required: file already has been exposed. */
	dup_mm_exe_file(mm, oldmm);

	mm->total_vm = oldmm->total_vm;
	mm->data_vm = oldmm->data_vm;
	mm->exec_vm = oldmm->exec_vm;
	mm->stack_vm = oldmm->stack_vm;

	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;
	retval = ksm_fork(mm, oldmm);
	if (retval)
		goto out;
	retval = khugepaged_fork(mm, oldmm);
	if (retval)
		goto out;

	prev = NULL;
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {
			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
			continue;
		}
		charge = 0;
		/*
		 * Don't duplicate many vmas if we've been oom-killed (for
		 * example)
		 */
		if (fatal_signal_pending(current)) {
			retval = -EINTR;
			goto out;
		}
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned long len = vma_pages(mpnt);

			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
				goto fail_nomem;
			charge = len;
		}
		tmp = vm_area_dup(mpnt);
		if (!tmp)
			goto fail_nomem;
		retval = vma_dup_policy(mpnt, tmp);
		if (retval)
			goto fail_nomem_policy;
		tmp->vm_mm = mm;
		retval = dup_userfaultfd(tmp, &uf);
		if (retval)
			goto fail_nomem_anon_vma_fork;
		if (tmp->vm_flags & VM_WIPEONFORK) {
			/*
			 * VM_WIPEONFORK gets a clean slate in the child.
			 * Don't prepare anon_vma until fault since we don't
			 * copy page for current vma.
			 */
			tmp->anon_vma = NULL;
		} else if (anon_vma_fork(tmp, mpnt))
			goto fail_nomem_anon_vma_fork;
		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
		file = tmp->vm_file;
		if (file) {
			struct address_space *mapping = file->f_mapping;

			get_file(file);
			i_mmap_lock_write(mapping);
			if (tmp->vm_flags & VM_SHARED)
				mapping_allow_writable(mapping);
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			vma_interval_tree_insert_after(tmp, mpnt,
					&mapping->i_mmap);
			flush_dcache_mmap_unlock(mapping);
			i_mmap_unlock_write(mapping);
		}

		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by hugetlb_fault()
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

		/*
		 * Link in the new vma and copy the page table entries.
		 */
		*pprev = tmp;
		pprev = &tmp->vm_next;
		tmp->vm_prev = prev;
		prev = tmp;

		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
		if (!(tmp->vm_flags & VM_WIPEONFORK))
			retval = copy_page_range(tmp, mpnt);

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
	/* a new mm has just been created */
	retval = arch_dup_mmap(oldmm, mm);
out:
	mmap_write_unlock(mm);
	flush_tlb_mm(oldmm);
	mmap_write_unlock(oldmm);
	dup_userfaultfd_complete(&uf);
fail_uprobe_end:
	uprobe_end_dup_mmap();
	return retval;
fail_nomem_anon_vma_fork:
	mpol_put(vma_policy(tmp));
fail_nomem_policy:
	vm_area_free(tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}

static inline int mm_alloc_pgd(struct mm_struct *mm)
{
	mm->pgd = pgd_alloc(mm);
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}

static inline void mm_free_pgd(struct mm_struct *mm)
{
	pgd_free(mm, mm->pgd);
}
#else
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	mmap_write_lock(oldmm);
	dup_mm_exe_file(mm, oldmm);
	mmap_write_unlock(oldmm);
	return 0;
}
#define mm_alloc_pgd(mm)	(0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

static void check_mm(struct mm_struct *mm)
{
	int i;

	BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
			 "Please make sure 'struct resident_page_types[]' is updated as well");

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		long x = atomic_long_read(&mm->rss_stat.count[i]);

		if (unlikely(x))
			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
				 mm, resident_page_types[i], x);
	}

	if (mm_pgtables_bytes(mm))
		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
				mm_pgtables_bytes(mm));

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))

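/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */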
void __mmdrop(struct mm_struct *mm)
{
	BUG_ON(mm == &init_mm);
	WARN_ON_ONCE(mm == current->mm);
	WARN_ON_ONCE(mm == current->active_mm);
	mm_free_pgd(mm);
	destroy_context(mm);
	mmu_notifier_subscriptions_destroy(mm);
	check_mm(mm);
	put_user_ns(mm->user_ns);
	free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);

static void mmdrop_async_fn(struct work_struct *work)
{
	struct mm_struct *mm;

	mm = container_of(work, struct mm_struct, async_put_work);
	__mmdrop(mm);
}

static void mmdrop_async(struct mm_struct *mm)
{
	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
		schedule_work(&mm->async_put_work);
	}
}

static inline void free_signal_struct(struct signal_struct *sig)
{
	taskstats_tgid_free(sig);
	sched_autogroup_exit(sig);
	/*
	 * __mmdrop is not safe to call from softirq context on x86 due to
	 * pgd_dtor so postpone it to the async context
	 */
	if (sig->oom_mm)
		mmdrop_async(sig->oom_mm);
	kmem_cache_free(signal_cachep, sig);
}

static inline void put_signal_struct(struct signal_struct *sig)
{
	if (refcount_dec_and_test(&sig->sigcnt))
		free_signal_struct(sig);
}

void __put_task_struct(struct task_struct *tsk)
{
	WARN_ON(!tsk->exit_state);
	WARN_ON(refcount_read(&tsk->usage));
	WARN_ON(tsk == current);

	io_uring_free(tsk);
	cgroup_free(tsk);
	task_numa_free(tsk, true);
	security_task_free(tsk);
	bpf_task_storage_free(tsk);
	exit_creds(tsk);
	delayacct_tsk_free(tsk);
	put_signal_struct(tsk->signal);
	sched_core_free(tsk);

	if (!profile_handoff_task(tsk))
		free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);

void __init __weak arch_task_cache_init(void) { }

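/*
 * set_max_threads
 */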
static void set_max_threads(unsigned int max_threads_suggested)
{
	u64 threads;
	unsigned long nr_pages = totalram_pages();

	/*
	 * The number of threads shall be limited such that the thread
	 * structures may only consume a small part of the available memory.
	 */
	if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
		threads = MAX_THREADS;
	else
		threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
				    (u64) THREAD_SIZE * 8UL);

	if (threads > max_threads_suggested)
		threads = max_threads_suggested;

	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
	/* Fetch thread_struct whitelist for the architecture. */
	arch_thread_struct_whitelist(offset, size);

	/*
	 * Handle zero-sized whitelist or empty thread_struct, otherwise
	 * adjust offset to position of thread_struct in task_struct.
	 */
	if (unlikely(*size == 0))
		*offset = 0;
	else
		*offset += offsetof(struct task_struct, thread);
}
#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */

void __init fork_init(void)
{
	int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN	0
#endif
	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
	unsigned long useroffset, usersize;

	/* create a slab on which task_structs can be allocated */
	task_struct_whitelist(&useroffset, &usersize);
	task_struct_cachep = kmem_cache_create_usercopy("task_struct",
			arch_task_struct_size, align,
			SLAB_PANIC|SLAB_ACCOUNT,
			useroffset, usersize, NULL);
#endif

	/* do the arch specific task caches init */
	arch_task_cache_init();

	set_max_threads(MAX_THREADS);

	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
	init_task.signal->rlim[RLIMIT_SIGPENDING] =
		init_task.signal->rlim[RLIMIT_NPROC];

	for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
		init_user_ns.ucount_max[i] = max_threads/2;

	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC,      RLIM_INFINITY);
	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE,   RLIM_INFINITY);
	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK,    RLIM_INFINITY);

#ifdef CONFIG_VMAP_STACK
	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
			  NULL, free_vm_stack_cache);
#endif

	scs_init();

	lockdep_init_task(&init_task);
	uprobes_init();
}

int __weak arch_dup_task_struct(struct task_struct *dst,
					       struct task_struct *src)
{
	*dst = *src;
	return 0;
}

void set_task_stack_end_magic(struct task_struct *tsk)
{
	unsigned long *stackend;

	stackend = end_of_stack(tsk);
	*stackend = STACK_END_MAGIC;	/* for overflow detection */
}

static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
	struct task_struct *tsk;
	unsigned long *stack;
	struct vm_struct *stack_vm_area __maybe_unused;
	int err;

	if (node == NUMA_NO_NODE)
		node = tsk_fork_get_node(orig);
	tsk = alloc_task_struct_node(node);
	if (!tsk)
		return NULL;

	stack = alloc_thread_stack_node(tsk, node);
	if (!stack)
		goto free_tsk;

	if (memcg_charge_kernel_stack(tsk))
		goto free_stack;

	stack_vm_area = task_stack_vm_area(tsk);

	err = arch_dup_task_struct(tsk, orig);

	/*
	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
	 * sure they're properly initialized before using any stack-related
	 * functions again.
	 */
	tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
	refcount_set(&tsk->stack_refcount, 1);
#endif

	if (err)
		goto free_stack;

	err = scs_prepare(tsk, node);
	if (err)
		goto free_stack;

#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
	 * the sighand lock in case orig has changed between now and
	 * then. Until then, filter must be NULL to avoid messing up
	 * the usage counts on the error path calling free_task.
	 */
	tsk->seccomp.filter = NULL;
#endif

	setup_thread_stack(tsk, orig);
	clear_user_return_notifier(tsk);
	clear_tsk_need_resched(tsk);
	set_task_stack_end_magic(tsk);
	clear_syscall_work_syscall_user_dispatch(tsk);

#ifdef CONFIG_STACKPROTECTOR
	tsk->stack_canary = get_random_canary();
#endif
	if (orig->cpus_ptr == &orig->cpus_mask)
		tsk->cpus_ptr = &tsk->cpus_mask;
	dup_user_cpus_ptr(tsk, orig, node);

	/*
	 * One for the user space visible state that goes away when reaped.
	 * One for the scheduler.
	 */
	refcount_set(&tsk->rcu_users, 2);
	/* One for the rcu users */
	refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
	tsk->btrace_seq = 0;
#endif
	tsk->splice_pipe = NULL;
	tsk->task_frag.page = NULL;
	tsk->wake_q.next = NULL;
	tsk->pf_io_worker = NULL;

	account_kernel_stack(tsk, 1);

	kcov_task_init(tsk);
	kmap_local_fork(tsk);

#ifdef CONFIG_FAULT_INJECTION
	tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
	tsk->throttle_queue = NULL;
	tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_MEMCG
	tsk->active_memcg = NULL;
#endif
	return tsk;

free_stack:
	free_thread_stack(tsk);
free_tsk:
	free_task_struct(tsk);
	return NULL;
}

__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

static int __init coredump_filter_setup(char *s)
{
	default_dump_filter =
		(simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
		MMF_DUMP_FILTER_MASK;
	return 1;
}

__setup("coredump_filter=", coredump_filter_setup);

#include <linux/init_task.h>

static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
	spin_lock_init(&mm->ioctx_lock);
	mm->ioctx_table = NULL;
#endif
}

static __always_inline void mm_clear_owner(struct mm_struct *mm,
					   struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	if (mm->owner == p)
		WRITE_ONCE(mm->owner, NULL);
#endif
}

static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	mm->owner = p;
#endif
}

static void mm_init_pasid(struct mm_struct *mm)
{
#ifdef CONFIG_IOMMU_SUPPORT
	mm->pasid = INIT_PASID;
#endif
}

static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
	mm->uprobes_state.xol_area = NULL;
#endif
}

static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
	struct user_namespace *user_ns)
{
	mm->mmap = NULL;
	mm->mm_rb = RB_ROOT;
	mm->vmacache_seqnum = 0;
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	seqcount_init(&mm->write_protect_seq);
	mmap_init_lock(mm);
	INIT_LIST_HEAD(&mm->mmlist);
	mm->core_state = NULL;
	mm_pgtables_bytes_init(mm);
	mm->map_count = 0;
	mm->locked_vm = 0;
	atomic64_set(&mm->pinned_vm, 0);
	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
	spin_lock_init(&mm->page_table_lock);
	spin_lock_init(&mm->arg_lock);
	mm_init_cpumask(mm);
	mm_init_aio(mm);
	mm_init_owner(mm, p);
	mm_init_pasid(mm);
	RCU_INIT_POINTER(mm->exe_file, NULL);
	mmu_notifier_subscriptions_init(mm);
	init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	mm->pmd_huge_pte = NULL;
#endif
	mm_init_uprobes_state(mm);
	hugetlb_count_init(mm);

	if (current->mm) {
		mm->flags = current->mm->flags & MMF_INIT_MASK;
		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
	} else {
		mm->flags = default_dump_filter;
		mm->def_flags = 0;
	}

	if (mm_alloc_pgd(mm))
		goto fail_nopgd;

	if (init_new_context(p, mm))
		goto fail_nocontext;

	mm->user_ns = get_user_ns(user_ns);
	return mm;

fail_nocontext:
	mm_free_pgd(mm);
fail_nopgd:
	free_mm(mm);
	return NULL;
}

/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct *mm_alloc(void)
{
	struct mm_struct *mm;

	mm = allocate_mm();
	if (!mm)
		return NULL;

	memset(mm, 0, sizeof(*mm));
	return mm_init(mm, current, current_user_ns());
}

static inline void __mmput(struct mm_struct *mm)
{
	VM_BUG_ON(atomic_read(&mm->mm_users));

	uprobe_clear_state(mm);
	exit_aio(mm);
	ksm_exit(mm);
	khugepaged_exit(mm); /* must run before exit_mmap */
	exit_mmap(mm);
	mm_put_huge_zero_page(mm);
	set_mm_exe_file(mm, NULL);
	if (!list_empty(&mm->mmlist)) {
		spin_lock(&mmlist_lock);
		list_del(&mm->mmlist);
		spin_unlock(&mmlist_lock);
	}
	if (mm->binfmt)
		module_put(mm->binfmt->module);
	mmdrop(mm);
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
	might_sleep();

	if (atomic_dec_and_test(&mm->mm_users))
		__mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);

#ifdef CONFIG_MMU
static void mmput_async_fn(struct work_struct *work)
{
	struct mm_struct *mm = container_of(work, struct mm_struct,
					    async_put_work);

	__mmput(mm);
}

void mmput_async(struct mm_struct *mm)
{
	if (atomic_dec_and_test(&mm->mm_users)) {
		INIT_WORK(&mm->async_put_work, mmput_async_fn);
		schedule_work(&mm->async_put_work);
	}
}
#endif

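/**
 * set_mm_exe_file - change a reference to the mm's executable file
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody alive left, in execve task is single
 * threaded.
 *
 * Can only fail if new_exe_file != NULL.
 */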
int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
	struct file *old_exe_file;

	/*
	 * It is safe to dereference the exe_file without RCU as
	 * this function is only called if nobody else can access
	 * this mm -- see comment above for justification.
	 */
	old_exe_file = rcu_dereference_raw(mm->exe_file);

	if (new_exe_file) {
		/*
		 * We expect the caller (i.e., sys_execve) to already denied
		 * write access, so this is unlikely to fail.
		 */
		if (unlikely(deny_write_access(new_exe_file)))
			return -EACCES;
		get_file(new_exe_file);
	}
	rcu_assign_pointer(mm->exe_file, new_exe_file);
	if (old_exe_file) {
		allow_write_access(old_exe_file);
		fput(old_exe_file);
	}
	return 0;
}

/**
 * replace_mm_exe_file - replace a reference to the mm's executable file
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
 * dealing with concurrent invocation and without grabbing the mmap lock in
 * write mode.
 *
 * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
 */
int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
	struct vm_area_struct *vma;
	struct file *old_exe_file;
	int ret = 0;

	/* Forbid mm->exe_file change if old file still mapped. */
	old_exe_file = get_mm_exe_file(mm);
	if (old_exe_file) {
		mmap_read_lock(mm);
		for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
			if (!vma->vm_file)
				continue;
			if (path_equal(&vma->vm_file->f_path,
				       &old_exe_file->f_path))
				ret = -EBUSY;
		}
		mmap_read_unlock(mm);
		fput(old_exe_file);
		if (ret)
			return ret;
	}

	/* set the new file, lockless */
	ret = deny_write_access(new_exe_file);
	if (ret)
		return -EACCES;
	get_file(new_exe_file);

	old_exe_file = xchg(&mm->exe_file, new_exe_file);
	if (old_exe_file) {
		/*
		 * Don't race with dup_mmap() getting the file and disallowing
		 * write access while someone might open the file writable.
		 */
		mmap_read_lock(mm);
		allow_write_access(old_exe_file);
		fput(old_exe_file);
		mmap_read_unlock(mm);
	}
	return 0;
}

/**
 * get_mm_exe_file - acquire a reference to the mm's executable file
 *
 * Returns %NULL if mm has no associated executable file.
 * User must release file via fput().
 */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
	struct file *exe_file;

	rcu_read_lock();
	exe_file = rcu_dereference(mm->exe_file);
	if (exe_file && !get_file_rcu(exe_file))
		exe_file = NULL;
	rcu_read_unlock();
	return exe_file;
}

/**
 * get_task_exe_file - acquire a reference to the task's executable file
 *
 * Returns %NULL if task's mm (if any) has no associated executable file or
 * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
 * User must release file via fput().
 */
struct file *get_task_exe_file(struct task_struct *task)
{
	struct file *exe_file = NULL;
	struct mm_struct *mm;

	task_lock(task);
	mm = task->mm;
	if (mm) {
		if (!(task->flags & PF_KTHREAD))
			exe_file = get_mm_exe_file(mm);
	}
	task_unlock(task);
	return exe_file;
}

/**
 * get_task_mm - acquire a reference to the task's mm
 *
 * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
 * this kernel workthread has transiently adopted a user mm with use_mm,
 * to do its AIO) is not set and if so returns a reference to it, after
 * bumping up the use count.  User must release the mm via mmput()
 * after use.  Typically used by /proc and ptrace.
 */
struct mm_struct *get_task_mm(struct task_struct *task)
{
	struct mm_struct *mm;

	task_lock(task);
	mm = task->mm;
	if (mm) {
		if (task->flags & PF_KTHREAD)
			mm = NULL;
		else
			mmget(mm);
	}
	task_unlock(task);
	return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);

struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
	struct mm_struct *mm;
	int err;

	err = down_read_killable(&task->signal->exec_update_lock);
	if (err)
		return ERR_PTR(err);

	mm = get_task_mm(task);
	if (mm && mm != current->mm &&
			!ptrace_may_access(task, mode)) {
		mmput(mm);
		mm = ERR_PTR(-EACCES);
	}
	up_read(&task->signal->exec_update_lock);

	return mm;
}

static void complete_vfork_done(struct task_struct *tsk)
{
	struct completion *vfork;

	task_lock(tsk);
	vfork = tsk->vfork_done;
	if (likely(vfork)) {
		tsk->vfork_done = NULL;
		complete(vfork);
	}
	task_unlock(tsk);
}

static int wait_for_vfork_done(struct task_struct *child,
				struct completion *vfork)
{
	int killed;

	freezer_do_not_count();
	cgroup_enter_frozen();
	killed = wait_for_completion_killable(vfork);
	cgroup_leave_frozen(false);
	freezer_count();

	if (killed) {
		task_lock(child);
		child->vfork_done = NULL;
		task_unlock(child);
	}

	put_task_struct(child);
	return killed;
}

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	uprobe_free_utask(tsk);

	/* Get rid of any cached register state */
	deactivate_mm(tsk, mm);

	/*
	 * Signal userspace if we're not exiting with a core dump
	 * because we want to leave the value intact for debugging
	 * purposes.
	 */
	if (tsk->clear_child_tid) {
		if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
		    atomic_read(&mm->mm_users) > 1) {
			/*
			 * We don't check the error code - if userspace has
			 * not set up a proper pointer then tough luck.
			 */
			put_user(0, tsk->clear_child_tid);
			do_futex(tsk->clear_child_tid, FUTEX_WAKE,
					1, NULL, NULL, 0, 0);
		}
		tsk->clear_child_tid = NULL;
	}

	/*
	 * All done, finally we can wake up parent and return this mm to him.
	 * Also kthread_stop() uses this completion for synchronization.
	 */
	if (tsk->vfork_done)
		complete_vfork_done(tsk);
}

void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	futex_exit_release(tsk);
	mm_release(tsk, mm);
}

void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	futex_exec_release(tsk);
	mm_release(tsk, mm);
}

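/**
 * dup_mm() - duplicates an existing mm structure
 * @tsk: the task_struct with which the new mm will be associated.
 * @oldmm: the mm to duplicate.
 *
 * Allocates a new mm structure and duplicates the provided @oldmm structure
 * content into it.
 *
 * Return: the duplicated mm or NULL on failure.
 */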
static struct mm_struct *dup_mm(struct task_struct *tsk,
				struct mm_struct *oldmm)
{
	struct mm_struct *mm;
	int err;

	mm = allocate_mm();
	if (!mm)
		goto fail_nomem;

	memcpy(mm, oldmm, sizeof(*mm));

	if (!mm_init(mm, tsk, mm->user_ns))
		goto fail_nomem;

	err = dup_mmap(mm, oldmm);
	if (err)
		goto free_pt;

	mm->hiwater_rss = get_mm_rss(mm);
	mm->hiwater_vm = mm->total_vm;

	if (mm->binfmt && !try_module_get(mm->binfmt->module))
		goto free_pt;

	return mm;

free_pt:
	/* don't put binfmt in mmput, we haven't got module yet */
	mm->binfmt = NULL;
	mm_init_owner(mm, NULL);
	mmput(mm);

fail_nomem:
	return NULL;
}

static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
	tsk->last_switch_time = 0;
#endif

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal a active VM for that..
	 */
	oldmm = current->mm;
	if (!oldmm)
		return 0;

	/* initialize the new vmacache entries */
	vmacache_flush(tsk);

	if (clone_flags & CLONE_VM) {
		mmget(oldmm);
		mm = oldmm;
	} else {
		mm = dup_mm(tsk, current->mm);
		if (!mm)
			return -ENOMEM;
	}

	tsk->mm = mm;
	tsk->active_mm = mm;
	return 0;
}

static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
	struct fs_struct *fs = current->fs;
	if (clone_flags & CLONE_FS) {
		/* tsk->fs is already what we want */
		spin_lock(&fs->lock);
		if (fs->in_exec) {
			spin_unlock(&fs->lock);
			return -EAGAIN;
		}
		fs->users++;
		spin_unlock(&fs->lock);
		return 0;
	}
	tsk->fs = copy_fs_struct(fs);
	if (!tsk->fs)
		return -ENOMEM;
	return 0;
}

static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
{
	struct files_struct *oldf, *newf;
	int error = 0;

	/*
	 * A background process may not have any files ...
	 */
	oldf = current->files;
	if (!oldf)
		goto out;

	if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);
		goto out;
	}

	newf = dup_fd(oldf, NR_OPEN_MAX, &error);
	if (!newf)
		goto out;

	tsk->files = newf;
	error = 0;
out:
	return error;
}

static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
	struct io_context *ioc = current->io_context;
	struct io_context *new_ioc;

	if (!ioc)
		return 0;
	/*
	 * Share io context with parent, if CLONE_IO is set
	 */
	if (clone_flags & CLONE_IO) {
		ioc_task_link(ioc);
		tsk->io_context = ioc;
	} else if (ioprio_valid(ioc->ioprio)) {
		new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
		if (unlikely(!new_ioc))
			return -ENOMEM;

		new_ioc->ioprio = ioc->ioprio;
		put_io_context(new_ioc);
	}
#endif
	return 0;
}

static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
	struct sighand_struct *sig;

	if (clone_flags & CLONE_SIGHAND) {
		refcount_inc(&current->sighand->count);
		return 0;
	}
	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
	RCU_INIT_POINTER(tsk->sighand, sig);
	if (!sig)
		return -ENOMEM;

	refcount_set(&sig->count, 1);
	spin_lock_irq(&current->sighand->siglock);
	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
	spin_unlock_irq(&current->sighand->siglock);

	/* Reset all signal handlers not set to SIG_IGN to SIG_DFL. */
	if (clone_flags & CLONE_CLEAR_SIGHAND)
		flush_signal_handlers(tsk, 0);

	return 0;
}

void __cleanup_sighand(struct sighand_struct *sighand)
{
	if (refcount_dec_and_test(&sighand->count)) {
		signalfd_cleanup(sighand);
		/*
		 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
		 * without an RCU grace period, see __lock_task_sighand().
		 */
		kmem_cache_free(sighand_cachep, sighand);
	}
}

/*
 * Initialize POSIX timer handling for a thread group.
 */
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
	struct posix_cputimers *pct = &sig->posix_cputimers;
	unsigned long cpu_limit;

	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
	posix_cputimers_group_init(pct, cpu_limit);
}

static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
	struct signal_struct *sig;

	if (clone_flags & CLONE_THREAD)
		return 0;

	sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
	tsk->signal = sig;
	if (!sig)
		return -ENOMEM;

	sig->nr_threads = 1;
	atomic_set(&sig->live, 1);
	refcount_set(&sig->sigcnt, 1);

	/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
	sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
	tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);

	init_waitqueue_head(&sig->wait_chldexit);
	sig->curr_target = tsk;
	init_sigpending(&sig->shared_pending);
	INIT_HLIST_HEAD(&sig->multiprocess);
	seqlock_init(&sig->stats_lock);
	prev_cputime_init(&sig->prev_cputime);

#ifdef CONFIG_POSIX_TIMERS
	INIT_LIST_HEAD(&sig->posix_timers);
	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	sig->real_timer.function = it_real_fn;
#endif

	task_lock(current->group_leader);
	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
	task_unlock(current->group_leader);

	posix_cpu_timers_init_group(sig);

	tty_audit_fork(sig);
	sched_autogroup_fork(sig);

	sig->oom_score_adj = current->signal->oom_score_adj;
	sig->oom_score_adj_min = current->signal->oom_score_adj_min;

	mutex_init(&sig->cred_guard_mutex);
	init_rwsem(&sig->exec_update_lock);

	return 0;
}

static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
	/*
	 * Must be called with sighand->lock held, which is common to
	 * all threads in the group. Holding cred_guard_mutex is not
	 * needed because this new task is not yet running and cannot
	 * be racing exec.
	 */
	assert_spin_locked(&current->sighand->siglock);

	/* Ref-count the new filter user, and assign it. */
	get_seccomp_filter(current);
	p->seccomp = current->seccomp;

	/*
	 * Explicitly enable no_new_privs here in case it got set
	 * between the task_struct being duplicated and holding the
	 * sighand lock. The seccomp state and nnp must be in sync.
	 */
	if (task_no_new_privs(current))
		task_set_no_new_privs(p);

	/*
	 * If the parent gained a seccomp mode after copying thread
	 * flags and before holding the sighand lock, we have to
	 * manually enable the seccomp thread flag here.
	 */
	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
		set_task_syscall_work(p, SECCOMP);
#endif
}

SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
	current->clear_child_tid = tidptr;

	return task_pid_vnr(current);
}

static void rt_mutex_init_task(struct task_struct *p)
{
	raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
	p->pi_waiters = RB_ROOT_CACHED;
	p->pi_top_task = NULL;
	p->pi_blocked_on = NULL;
#endif
}

static inline void init_task_pid_links(struct task_struct *task)
{
	enum pid_type type;

	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_NODE(&task->pid_links[type]);
}

static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
	if (type == PIDTYPE_PID)
		task->thread_pid = pid;
	else
		task->signal->pids[type] = pid;
}

static inline void rcu_copy_process(struct task_struct *p)
{
#ifdef CONFIG_PREEMPT_RCU
	p->rcu_read_lock_nesting = 0;
	p->rcu_read_unlock_special.s = 0;
	p->rcu_blocked_node = NULL;
	INIT_LIST_HEAD(&p->rcu_node_entry);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
	p->rcu_tasks_holdout = false;
	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
	p->rcu_tasks_idle_cpu = -1;
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
	p->trc_reader_nesting = 0;
	p->trc_reader_special.s = 0;
	INIT_LIST_HEAD(&p->trc_holdout_list);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

struct pid *pidfd_pid(const struct file *file)
{
	if (file->f_op == &pidfd_fops)
		return file->private_data;

	return ERR_PTR(-EBADF);
}

static int pidfd_release(struct inode *inode, struct file *file)
{
	struct pid *pid = file->private_data;

	file->private_data = NULL;
	put_pid(pid);
	return 0;
}

#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its pid.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid of a given pidfd refers to for all descendant pid namespaces
 * starting from the procfs instance of the pid namespace of the process.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * field.
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct pid *pid = f->private_data;
	struct pid_namespace *ns;
	pid_t nr = -1;

	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
		ns = proc_pid_ns(file_inode(m->file)->i_sb);
		nr = pid_nr_ns(pid, ns);
	}

	seq_put_decimal_ll(m, "Pid:\t", nr);

#ifdef CONFIG_PID_NS
	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
	if (nr > 0) {
		int i;

		/*
		 * If nr is non-zero it means that 'pid' is valid and that
		 * ns, i.e. the pid namespace associated with the procfs
		 * instance, is in the pid namespace hierarchy of pid.
		 * Start at one below the already printed level.
		 */
		for (i = ns->level + 1; i <= pid->level; i++)
			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
	}
#endif
	seq_putc(m, '\n');
}
#endif

/*
 * Poll support for process exit notification.
 */
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
	struct pid *pid = file->private_data;
	__poll_t poll_flags = 0;

	poll_wait(file, &pid->wait_pidfd, pts);

	/*
	 * Inform pollers only when the whole thread group exits.
	 * If the thread group leader exits before all other threads in the
	 * group, then poll(2) should block, similar to the wait(2) family.
	 */
	if (thread_group_exited(pid))
		poll_flags = EPOLLIN | EPOLLRDNORM;

	return poll_flags;
}

const struct file_operations pidfd_fops = {
	.release = pidfd_release,
	.poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
	.show_fdinfo = pidfd_show_fdinfo,
#endif
};

static void __delayed_free_task(struct rcu_head *rhp)
{
	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

	free_task(tsk);
}

static __always_inline void delayed_free_task(struct task_struct *tsk)
{
	if (IS_ENABLED(CONFIG_MEMCG))
		call_rcu(&tsk->rcu, __delayed_free_task);
	else
		free_task(tsk);
}

static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
{
	/* Skip if kernel thread */
	if (!tsk->mm)
		return;

	/* Skip if spawning a thread or using vfork */
	if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
		return;

	/* We need to synchronize with __set_oom_adj */
	mutex_lock(&oom_adj_mutex);
	set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
	/* Update the values in case they were changed after copy_signal */
	tsk->signal->oom_score_adj = current->signal->oom_score_adj;
	tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
	mutex_unlock(&oom_adj_mutex);
}

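/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */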
static __latent_entropy struct task_struct *copy_process(
					struct pid *pid,
					int trace,
					int node,
					struct kernel_clone_args *args)
{
	int pidfd = -1, retval;
	struct task_struct *p;
	struct multiprocess_signals delayed;
	struct file *pidfile = NULL;
	u64 clone_flags = args->flags;
	struct nsproxy *nsp = current->nsproxy;

	/*
	 * Don't allow sharing the root directory with processes in a different
	 * namespace
	 */
	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	/*
	 * Siblings of global init remain as zombies on exit since they are
	 * not reaped by their parent (swapper). To solve this and to avoid
	 * multi-rooted process trees, prevent global and container-inits
	 * from creating siblings.
	 */
	if ((clone_flags & CLONE_PARENT) &&
				current->signal->flags & SIGNAL_UNKILLABLE)
		return ERR_PTR(-EINVAL);

	/*
	 * If the new process will be in a different pid or user namespace
	 * do not allow it to share a thread group with the forking task.
	 */
	if (clone_flags & CLONE_THREAD) {
		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
		    (task_active_pid_ns(current) != nsp->pid_ns_for_children))
			return ERR_PTR(-EINVAL);
	}

	/*
	 * If the new process will be in a different time namespace
	 * do not allow it to share VM or a thread group with the forking task.
	 */
	if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
		if (nsp->time_ns != nsp->time_ns_for_children)
			return ERR_PTR(-EINVAL);
	}

	if (clone_flags & CLONE_PIDFD) {
		/*
		 * - CLONE_DETACHED is blocked so that we can potentially
		 *   reuse it later for CLONE_PIDFD.
		 * - CLONE_THREAD is blocked until someone really needs it.
		 */
		if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
			return ERR_PTR(-EINVAL);
	}

	/*
	 * Force any signals received before this point to be delivered
	 * before the fork happens.  Collect up signals sent to multiple
	 * processes that happen during the fork and delay them so that
	 * they appear to happen after the fork.
	 */
	sigemptyset(&delayed.signal);
	INIT_HLIST_NODE(&delayed.node);

	spin_lock_irq(&current->sighand->siglock);
	if (!(clone_flags & CLONE_THREAD))
		hlist_add_head(&delayed.node, &current->signal->multiprocess);
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);
	retval = -ERESTARTNOINTR;
	if (task_sigpending(current))
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current, node);
	if (!p)
		goto fork_out;
	if (args->io_thread) {
		/*
		 * Mark us an IO worker, and block any signal that isn't
		 * fatal or STOP
		 */
		p->flags |= PF_IO_WORKER;
		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
	}

	/*
	 * This _must_ happen before we call free_task(), i.e. before we jump
	 * to any of the bad_fork_* labels. This is to avoid freeing
	 * p->set_child_tid which is (ab)used as a kthread's data pointer for
	 * kernel threads (see kthread_use_mm).
	 */
	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;

	ftrace_graph_init_task(p);

	rt_mutex_init_task(p);

	lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	retval = -EAGAIN;
	if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
		if (p->real_cred->user != INIT_USER &&
		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
			goto bad_fork_free;
	}
	current->flags &= ~PF_NPROC_EXCEEDED;

	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	retval = -EAGAIN;
	if (data_race(nr_threads >= max_threads))
		goto bad_fork_cleanup_count;

	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
	p->flags |= PF_FORKNOEXEC;
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	rcu_copy_process(p);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	init_sigpending(&p->pending);

	p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	p->utimescaled = p->stimescaled = 0;
#endif
	prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	seqcount_init(&p->vtime.seqcount);
	p->vtime.starttime = 0;
	p->vtime.state = VTIME_INACTIVE;
#endif

#ifdef CONFIG_IO_URING
	p->io_uring = NULL;
#endif

#if defined(SPLIT_RSS_COUNTING)
	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

	p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
	p->psi_flags = 0;
#endif

	task_io_accounting_init(&p->ioac);
	acct_clear_integrals(p);

	posix_cputimers_init(&p->posix_cputimers);

	p->io_context = NULL;
	audit_set_context(p, NULL);
	cgroup_fork(p);
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_dup(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup_threadgroup_lock;
	}
#endif
#ifdef CONFIG_CPUSETS
	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
	p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
	seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	memset(&p->irqtrace, 0, sizeof(p->irqtrace));
	p->irqtrace.hardirq_disable_ip	= _THIS_IP_;
	p->irqtrace.softirq_enable_ip	= _THIS_IP_;
	p->softirqs_enabled		= 1;
	p->softirq_context		= 0;
#endif

	p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
	lockdep_init_task(p);
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
	p->sequential_io	= 0;
	p->sequential_io_avg	= 0;
#endif
#ifdef CONFIG_BPF_SYSCALL
	RCU_INIT_POINTER(p->bpf_storage, NULL);
	p->bpf_ctx = NULL;
#endif

	/* Perform scheduler related setup. Assign this task to a CPU. */
	retval = sched_fork(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_policy;

	retval = perf_event_init_task(p, clone_flags);
	if (retval)
		goto bad_fork_cleanup_policy;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_perf;
	/* copy all the process information */
	shm_init_task(p);
	retval = security_task_alloc(p, clone_flags);
	if (retval)
		goto bad_fork_cleanup_audit;
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_security;
	retval = copy_files(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_semundo;
	retval = copy_fs(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_files;
	retval = copy_sighand(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_fs;
	retval = copy_signal(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_sighand;
	retval = copy_mm(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_signal;
	retval = copy_namespaces(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_mm;
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
	if (retval)
		goto bad_fork_cleanup_io;

	stackleak_task_init(p);

	if (pid != &init_struct_pid) {
		pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
				args->set_tid_size);
		if (IS_ERR(pid)) {
			retval = PTR_ERR(pid);
			goto bad_fork_cleanup_thread;
		}
	}

	/*
	 * This has to happen after we've potentially unshared the file
	 * descriptor table (so that the pidfd doesn't leak).
	 */
	if (clone_flags & CLONE_PIDFD) {
		retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
		if (retval < 0)
			goto bad_fork_free_pid;

		pidfd = retval;

		pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
					      O_RDWR | O_CLOEXEC);
		if (IS_ERR(pidfile)) {
			put_unused_fd(pidfd);
			retval = PTR_ERR(pidfile);
			goto bad_fork_free_pid;
		}
		get_pid(pid);	/* held on behalf of file */

		retval = put_user(pidfd, args->pidfd);
		if (retval)
			goto bad_fork_put_pidfd;
	}

#ifdef CONFIG_BLOCK
	p->plug = NULL;
#endif
	futex_init_task(p);

	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */
	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
		sas_ss_reset(p);

	/*
	 * Syscall tracing and stepping should be turned off in the
	 * child regardless of CLONE_PTRACE.
	 */
	user_disable_single_step(p);
	clear_task_syscall_work(p, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
	clear_task_syscall_work(p, SYSCALL_EMU);
#endif
	clear_tsk_latency_tracing(p);

	/* ok, now we should be set up.. */
	p->pid = pid_nr(pid);
	if (clone_flags & CLONE_THREAD) {
		p->group_leader = current->group_leader;
		p->tgid = current->tgid;
	} else {
		p->group_leader = p;
		p->tgid = p->pid;
	}

	p->nr_dirtied = 0;
	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
	p->dirty_paused_when = 0;

	p->pdeath_signal = 0;
	INIT_LIST_HEAD(&p->thread_group);
	p->task_works = NULL;

#ifdef CONFIG_KRETPROBES
	p->kretprobe_instances.first = NULL;
#endif

	/*
	 * Ensure that the cgroup subsystem policies allow the new process to be
	 * forked. It should be noted that the new process's css_set can be changed
	 * between here and cgroup_post_fork() if an organisation operation is in
	 * progress.
	 */
	retval = cgroup_can_fork(p, args);
	if (retval)
		goto bad_fork_put_pidfd;

	/*
	 * From this point on we must avoid any synchronous user-space
	 * communication until we take the tasklist-lock. In particular, we do
	 * not want user-space to be able to predict the process start-time by
	 * stalling fork(2) after we recorded the start_time but before it is
	 * visible to the system.
	 */
	p->start_time = ktime_get_ns();
	p->start_boottime = ktime_get_boottime_ns();

	/*
	 * Make it visible to the rest of the system, but dont wake it up yet.
	 * Need tasklist lock for parent etc handling!
	 */
	write_lock_irq(&tasklist_lock);

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
		p->real_parent = current->real_parent;
		p->parent_exec_id = current->parent_exec_id;
		if (clone_flags & CLONE_THREAD)
			p->exit_signal = -1;
		else
			p->exit_signal = current->group_leader->exit_signal;
	} else {
		p->real_parent = current;
		p->parent_exec_id = current->self_exec_id;
		p->exit_signal = args->exit_signal;
	}

	klp_copy_process(p);

	sched_core_fork(p);

	spin_lock(&current->sighand->siglock);

	/*
	 * Copy seccomp details explicitly here, in case they were changed
	 * before holding sighand lock.
	 */
	copy_seccomp(p);

	rseq_fork(p, clone_flags);

	/* Don't start children in a dying pid namespace */
	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
		retval = -ENOMEM;
		goto bad_fork_cancel_cgroup;
	}

	/* Let kill terminate clone/fork in the middle */
	if (fatal_signal_pending(current)) {
		retval = -EINTR;
		goto bad_fork_cancel_cgroup;
	}

	/* past the last point of failure */
	if (pidfile)
		fd_install(pidfd, pidfile);

	init_task_pid_links(p);
	if (likely(p->pid)) {
		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

		init_task_pid(p, PIDTYPE_PID, pid);
		if (thread_group_leader(p)) {
			init_task_pid(p, PIDTYPE_TGID, pid);
			init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
			init_task_pid(p, PIDTYPE_SID, task_session(current));

			if (is_child_reaper(pid)) {
				ns_of_pid(pid)->child_reaper = p;
				p->signal->flags |= SIGNAL_UNKILLABLE;
			}
			p->signal->shared_pending.signal = delayed.signal;
			p->signal->tty = tty_kref_get(current->signal->tty);
			/*
			 * Inherit has_child_subreaper flag under the same
			 * tasklist_lock with adding child to the process tree
			 * for propagate_has_child_subreaper optimization.
			 */
			p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
							 p->real_parent->signal->is_child_subreaper;
			list_add_tail(&p->sibling, &p->real_parent->children);
			list_add_tail_rcu(&p->tasks, &init_task.tasks);
			attach_pid(p, PIDTYPE_TGID);
			attach_pid(p, PIDTYPE_PGID);
			attach_pid(p, PIDTYPE_SID);
			__this_cpu_inc(process_counts);
		} else {
			current->signal->nr_threads++;
			atomic_inc(&current->signal->live);
			refcount_inc(&current->signal->sigcnt);
			task_join_group_stop(p);
			list_add_tail_rcu(&p->thread_group,
					  &p->group_leader->thread_group);
			list_add_tail_rcu(&p->thread_node,
					  &p->signal->thread_head);
		}
		attach_pid(p, PIDTYPE_PID);
		nr_threads++;
	}
	total_forks++;
	hlist_del_init(&delayed.node);
	spin_unlock(&current->sighand->siglock);
	syscall_tracepoint_update(p);
	write_unlock_irq(&tasklist_lock);

	proc_fork_connector(p);
	sched_post_fork(p);
	cgroup_post_fork(p, args);
	perf_event_fork(p);

	trace_task_newtask(p, clone_flags);
	uprobe_copy_process(p, clone_flags);

	copy_oom_score_adj(clone_flags, p);

	return p;

bad_fork_cancel_cgroup:
	sched_core_free(p);
	spin_unlock(&current->sighand->siglock);
	write_unlock_irq(&tasklist_lock);
	cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
	if (clone_flags & CLONE_PIDFD) {
		fput(pidfile);
		put_unused_fd(pidfd);
	}
bad_fork_free_pid:
	if (pid != &init_struct_pid)
		free_pid(pid);
bad_fork_cleanup_thread:
	exit_thread(p);
bad_fork_cleanup_io:
	if (p->io_context)
		exit_io_context(p);
bad_fork_cleanup_namespaces:
	exit_task_namespaces(p);
bad_fork_cleanup_mm:
	if (p->mm) {
		mm_clear_owner(p->mm, p);
		mmput(p->mm);
	}
bad_fork_cleanup_signal:
	if (!(clone_flags & CLONE_THREAD))
		free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_security:
	security_task_free(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_perf:
	perf_event_free_task(p);
bad_fork_cleanup_policy:
	lockdep_free_task(p);
#ifdef CONFIG_NUMA
	mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
	delayacct_tsk_free(p);
bad_fork_cleanup_count:
	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
	exit_creds(p);
bad_fork_free:
	WRITE_ONCE(p->__state, TASK_DEAD);
	put_task_stack(p);
	delayed_free_task(p);
fork_out:
	spin_lock_irq(&current->sighand->siglock);
	hlist_del_init(&delayed.node);
	spin_unlock_irq(&current->sighand->siglock);
	return ERR_PTR(retval);
}

static inline void init_idle_pids(struct task_struct *idle)
{
	enum pid_type type;

	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
		INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
		init_task_pid(idle, type, &init_struct_pid);
	}
}

struct task_struct * __init fork_idle(int cpu)
{
	struct task_struct *task;
	struct kernel_clone_args args = {
		.flags = CLONE_VM,
	};

	task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
	if (!IS_ERR(task)) {
		init_idle_pids(task);
		init_idle(task, cpu);
	}

	return task;
}

struct mm_struct *copy_init_mm(void)
{
	return dup_mm(NULL, &init_mm);
}

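/*
 * This is like kernel_thread(), but for io_uring worker threads: the new
 * task shares the caller's fs, files, and signal handling, runs fn(arg)
 * on the given NUMA node, and is marked PF_IO_WORKER via args->io_thread.
 *
 * Roughly how io_uring's io-wq uses it (sketch, not a verbatim caller):
 *
 *	tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
 *	if (!IS_ERR(tsk))
 *		wake_up_new_task(tsk);
 */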
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
	unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
				CLONE_IO;
	struct kernel_clone_args args = {
		.flags		= ((lower_32_bits(flags) | CLONE_VM |
				    CLONE_UNTRACED) & ~CSIGNAL),
		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
		.stack		= (unsigned long)fn,
		.stack_size	= (unsigned long)arg,
		.io_thread	= 1,
	};

	return copy_process(NULL, 0, node, &args);
}

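/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 */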
2543pid_t kernel_clone(struct kernel_clone_args *args)
2544{
2545 u64 clone_flags = args->flags;
2546 struct completion vfork;
2547 struct pid *pid;
2548 struct task_struct *p;
2549 int trace = 0;
2550 pid_t nr;
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561 if ((args->flags & CLONE_PIDFD) &&
2562 (args->flags & CLONE_PARENT_SETTID) &&
2563 (args->pidfd == args->parent_tid))
2564 return -EINVAL;
2565
2566
2567
2568
2569
2570
2571
2572 if (!(clone_flags & CLONE_UNTRACED)) {
2573 if (clone_flags & CLONE_VFORK)
2574 trace = PTRACE_EVENT_VFORK;
2575 else if (args->exit_signal != SIGCHLD)
2576 trace = PTRACE_EVENT_CLONE;
2577 else
2578 trace = PTRACE_EVENT_FORK;
2579
2580 if (likely(!ptrace_event_enabled(current, trace)))
2581 trace = 0;
2582 }
2583
2584 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2585 add_latent_entropy();
2586
2587 if (IS_ERR(p))
2588 return PTR_ERR(p);
2589
2590
2591
2592
2593
	trace_sched_process_fork(current, p);

	pid = get_task_pid(p, PIDTYPE_PID);
	nr = pid_vnr(pid);

	if (clone_flags & CLONE_PARENT_SETTID)
		put_user(nr, args->parent_tid);

	if (clone_flags & CLONE_VFORK) {
		p->vfork_done = &vfork;
		init_completion(&vfork);
		get_task_struct(p);
	}

	wake_up_new_task(p);

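	/* forking complete and child started to run, tell ptracer */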
	if (unlikely(trace))
		ptrace_event_pid(trace, pid);

	if (clone_flags & CLONE_VFORK) {
		if (!wait_for_vfork_done(p, &vfork))
			ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
	}

	put_pid(pid);
	return nr;
}

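/*
 * Create a kernel thread.
 */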
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	struct kernel_clone_args args = {
		.flags		= ((lower_32_bits(flags) | CLONE_VM |
				    CLONE_UNTRACED) & ~CSIGNAL),
		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
		.stack		= (unsigned long)fn,
		.stack_size	= (unsigned long)arg,
	};

	return kernel_clone(&args);
}

#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
	struct kernel_clone_args args = {
		.exit_signal = SIGCHLD,
	};

	return kernel_clone(&args);
#else
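	/* cannot support in nommu mode */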
	return -EINVAL;
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
	struct kernel_clone_args args = {
		.flags		= CLONE_VFORK | CLONE_VM,
		.exit_signal	= SIGCHLD,
	};

	return kernel_clone(&args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 unsigned long, tls,
		 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
		int, stack_size,
		int __user *, parent_tidptr,
		int __user *, child_tidptr,
		unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 unsigned long, tls)
#endif
{
	struct kernel_clone_args args = {
		.flags		= (lower_32_bits(clone_flags) & ~CSIGNAL),
		.pidfd		= parent_tidptr,
		.child_tid	= child_tidptr,
		.parent_tid	= parent_tidptr,
		.exit_signal	= (lower_32_bits(clone_flags) & CSIGNAL),
		.stack		= newsp,
		.tls		= tls,
	};

	return kernel_clone(&args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE3

noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
					      struct clone_args __user *uargs,
					      size_t usize)
{
	int err;
	struct clone_args args;
	pid_t *kset_tid = kargs->set_tid;

	BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
		     CLONE_ARGS_SIZE_VER0);
	BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
		     CLONE_ARGS_SIZE_VER1);
	BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
		     CLONE_ARGS_SIZE_VER2);
	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);

	if (unlikely(usize > PAGE_SIZE))
		return -E2BIG;
	if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
		return -EINVAL;

	err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
	if (err)
		return err;

	if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
		return -EINVAL;

	if (unlikely(!args.set_tid && args.set_tid_size > 0))
		return -EINVAL;

	if (unlikely(args.set_tid && args.set_tid_size == 0))
		return -EINVAL;

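	/*
	 * Verify that higher 32bits of exit_signal are unset and that
	 * it is a valid signal
	 */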
	if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
		     !valid_signal(args.exit_signal)))
		return -EINVAL;

	if ((args.flags & CLONE_INTO_CGROUP) &&
	    (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
		return -EINVAL;

	*kargs = (struct kernel_clone_args){
		.flags		= args.flags,
		.pidfd		= u64_to_user_ptr(args.pidfd),
		.child_tid	= u64_to_user_ptr(args.child_tid),
		.parent_tid	= u64_to_user_ptr(args.parent_tid),
		.exit_signal	= args.exit_signal,
		.stack		= args.stack,
		.stack_size	= args.stack_size,
		.tls		= args.tls,
		.set_tid_size	= args.set_tid_size,
		.cgroup		= args.cgroup,
	};

	if (args.set_tid &&
		copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
			       (kargs->set_tid_size * sizeof(pid_t))))
		return -EFAULT;

	kargs->set_tid = kset_tid;

	return 0;
}

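/**
 * clone3_stack_valid - check and prepare stack
 * @kargs: kernel clone args
 *
 * Verify that the stack arguments userspace gave us are sane.
 * In addition, set the stack direction for userspace since it's easy for us to
 * determine.
 */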
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
{
	if (kargs->stack == 0) {
		if (kargs->stack_size > 0)
			return false;
	} else {
		if (kargs->stack_size == 0)
			return false;

		if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
			return false;

#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
		kargs->stack += kargs->stack_size;
#endif
	}

	return true;
}

static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
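	/* Verify that no unknown flags are passed along. */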
	if (kargs->flags &
	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
		return false;

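	/*
	 * - make the CLONE_DETACHED bit reusable for clone3
	 * - make the CSIGNAL bits reusable for clone3
	 */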
	if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
		return false;

	if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
	    (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
		return false;

	if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
	    kargs->exit_signal)
		return false;

	if (!clone3_stack_valid(kargs))
		return false;

	return true;
}

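/**
 * clone3 - create a new process with specific properties
 * @uargs: argument structure
 * @size:  size of @uargs
 *
 * clone3() is the extension of clone()/clone2(). It takes a struct as
 * argument that is versioned by size.
 *
 * Return: On success, a positive PID for the child process.
 *         On error, a negative errno number.
 */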
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
{
	int err;

	struct kernel_clone_args kargs;
	pid_t set_tid[MAX_PID_NS_LEVEL];

	kargs.set_tid = set_tid;

	err = copy_clone_args_from_user(&kargs, uargs, size);
	if (err)
		return err;

	if (!clone3_args_valid(&kargs))
		return -EINVAL;

	return kernel_clone(&kargs);
}
#endif

void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
	struct task_struct *leader, *parent, *child;
	int res;

	read_lock(&tasklist_lock);
	leader = top = top->group_leader;
down:
	for_each_thread(leader, parent) {
		list_for_each_entry(child, &parent->children, sibling) {
			res = visitor(child, data);
			if (res) {
				if (res < 0)
					goto out;
				leader = child;
				goto down;
			}
up:
			;
		}
	}

	if (leader != top) {
		child = leader;
		parent = child->real_parent;
		leader = parent->group_leader;
		goto up;
	}
out:
	read_unlock(&tasklist_lock);
}

#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif

static void sighand_ctor(void *data)
{
	struct sighand_struct *sighand = data;

	spin_lock_init(&sighand->siglock);
	init_waitqueue_head(&sighand->signalfd_wqh);
}

void __init proc_caches_init(void)
{
	unsigned int mm_size;

	sighand_cachep = kmem_cache_create("sighand_cache",
			sizeof(struct sighand_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
			SLAB_ACCOUNT, sighand_ctor);
	signal_cachep = kmem_cache_create("signal_cache",
			sizeof(struct signal_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			NULL);
	files_cachep = kmem_cache_create("files_cache",
			sizeof(struct files_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			NULL);
	fs_cachep = kmem_cache_create("fs_cache",
			sizeof(struct fs_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			NULL);

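	/*
	 * The mm_cpumask is located at the end of mm_struct, and is
	 * dynamically sized based on the maximum CPU number this system
	 * can have, taking hotplug into account (nr_cpu_ids).
	 */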
	mm_size = sizeof(struct mm_struct) + cpumask_size();

	mm_cachep = kmem_cache_create_usercopy("mm_struct",
			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			offsetof(struct mm_struct, saved_auxv),
			sizeof_field(struct mm_struct, saved_auxv),
			NULL);
	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
	mmap_init();
	nsproxy_cache_init();
}

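/*
 * Check constraints on flags passed to the unshare system call.
 */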
static int check_unshare_flags(unsigned long unshare_flags)
{
	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
				CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
				CLONE_NEWTIME))
		return -EINVAL;

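	/*
	 * Not implemented, but pretend it works if there is nothing
	 * to unshare.  Note that unsharing the address space or the
	 * signal handlers also need to unshare the signal queues (aka
	 * CLONE_THREAD).
	 */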
	if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
		if (!thread_group_empty(current))
			return -EINVAL;
	}
	if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
		if (refcount_read(&current->sighand->count) > 1)
			return -EINVAL;
	}
	if (unshare_flags & CLONE_VM) {
		if (!current_is_single_threaded())
			return -EINVAL;
	}

	return 0;
}

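/*
 * Unshare the filesystem structure if it is being shared
 */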
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
	struct fs_struct *fs = current->fs;

	if (!(unshare_flags & CLONE_FS) || !fs)
		return 0;

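	/* don't need lock here; in the worst case we'll do useless copy */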
	if (fs->users == 1)
		return 0;

	*new_fsp = copy_fs_struct(fs);
	if (!*new_fsp)
		return -ENOMEM;

	return 0;
}

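/*
 * Unshare file descriptor table if it is being shared
 */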
int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
	       struct files_struct **new_fdp)
{
	struct files_struct *fd = current->files;
	int error = 0;

	if ((unshare_flags & CLONE_FILES) &&
	    (fd && atomic_read(&fd->count) > 1)) {
		*new_fdp = dup_fd(fd, max_fds, &error);
		if (!*new_fdp)
			return error;
	}

	return 0;
}

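/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by kernel_clone() cannot be used here directly
 * because they modify an inactive task_struct that is being
 * freed memory.
 */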
int ksys_unshare(unsigned long unshare_flags)
{
	struct fs_struct *fs, *new_fs = NULL;
	struct files_struct *fd, *new_fd = NULL;
	struct cred *new_cred = NULL;
	struct nsproxy *new_nsproxy = NULL;
	int do_sysvsem = 0;
	int err;

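	/*
	 * If unsharing a user namespace must also unshare the thread group
	 * and unshare the filesystem root and working directories.
	 */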
	if (unshare_flags & CLONE_NEWUSER)
		unshare_flags |= CLONE_THREAD | CLONE_FS;

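	/* If unsharing vm, must also unshare signal handlers. */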
	if (unshare_flags & CLONE_VM)
		unshare_flags |= CLONE_SIGHAND;

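	/* If unsharing signal handlers, must also unshare the signal queues. */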
	if (unshare_flags & CLONE_SIGHAND)
		unshare_flags |= CLONE_THREAD;

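	/* If unsharing namespace, must also unshare filesystem information. */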
	if (unshare_flags & CLONE_NEWNS)
		unshare_flags |= CLONE_FS;

	err = check_unshare_flags(unshare_flags);
	if (err)
		goto bad_unshare_out;

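	/*
	 * CLONE_NEWIPC must also detach from the undolist: after switching
	 * to a new ipc namespace, the semaphore arrays from the old
	 * namespace are unreachable.
	 */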
	if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
		do_sysvsem = 1;
	err = unshare_fs(unshare_flags, &new_fs);
	if (err)
		goto bad_unshare_out;
	err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
	if (err)
		goto bad_unshare_cleanup_fs;
	err = unshare_userns(unshare_flags, &new_cred);
	if (err)
		goto bad_unshare_cleanup_fd;
	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
					 new_cred, new_fs);
	if (err)
		goto bad_unshare_cleanup_cred;

	if (new_cred) {
		err = set_cred_ucounts(new_cred);
		if (err)
			goto bad_unshare_cleanup_cred;
	}

	if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
		if (do_sysvsem) {
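			/*
			 * CLONE_SYSVSEM is equivalent to sys_exit().
			 */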
			exit_sem(current);
		}
		if (unshare_flags & CLONE_NEWIPC) {
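			/* Orphan segments in the old ipc namespace. */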
			exit_shm(current);
			shm_init_task(current);
		}

		if (new_nsproxy)
			switch_task_namespaces(current, new_nsproxy);

		task_lock(current);

		if (new_fs) {
			fs = current->fs;
			spin_lock(&fs->lock);
			current->fs = new_fs;
			if (--fs->users)
				new_fs = NULL;
			else
				new_fs = fs;
			spin_unlock(&fs->lock);
		}

		if (new_fd) {
			fd = current->files;
			current->files = new_fd;
			new_fd = fd;
		}

		task_unlock(current);

		if (new_cred) {
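			/* Install the new user namespace */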
			commit_creds(new_cred);
			new_cred = NULL;
		}
	}

	perf_event_namespaces(current);

bad_unshare_cleanup_cred:
	if (new_cred)
		put_cred(new_cred);
bad_unshare_cleanup_fd:
	if (new_fd)
		put_files_struct(new_fd);

bad_unshare_cleanup_fs:
	if (new_fs)
		free_fs_struct(new_fs);

bad_unshare_out:
	return err;
}

SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
	return ksys_unshare(unshare_flags);
}

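/*
 *	Helper to unshare the files of the current task.
 *	We don't want to expose copy_files internals to
 *	the exec layer of the kernel.
 */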
int unshare_files(void)
{
	struct task_struct *task = current;
	struct files_struct *old, *copy = NULL;
	int error;

	error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
	if (error || !copy)
		return error;

	old = task->files;
	task_lock(task);
	task->files = copy;
	task_unlock(task);
	put_files_struct(old);
	return 0;
}

int sysctl_max_threads(struct ctl_table *table, int write,
		       void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int ret;
	int threads = max_threads;
	int min = 1;
	int max = MAX_THREADS;

	t = *table;
	t.data = &threads;
	t.extra1 = &min;
	t.extra2 = &max;

	ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (ret || !write)
		return ret;

	max_threads = threads;

	return 0;
}