// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int nr_threads;			/* The idle threads do not count.. */

static int max_threads;		/* tunable limit on nr_threads */

#define NAMED_ARRAY_INDEX(x)	[x] = __stringify(x)

static const char * const resident_page_types[] = {
	NAMED_ARRAY_INDEX(MM_FILEPAGES),
	NAMED_ARRAY_INDEX(MM_ANONPAGES),
	NAMED_ARRAY_INDEX(MM_SWAPENTS),
	NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);

#ifdef CONFIG_PROVE_RCU
int lockdep_tasklist_lock_is_held(void)
{
	return lockdep_is_held(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif

int nr_processes(void)
{
	int cpu;
	int total = 0;

	for_each_possible_cpu(cpu)
		total += per_cpu(process_counts, cpu);

	return total;
}

void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;

static inline struct task_struct *alloc_task_struct_node(int node)
{
	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}

static inline void free_task_struct(struct task_struct *tsk)
{
	kmem_cache_free(task_struct_cachep, tsk);
}
#endif

#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

#ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

static int free_vm_stack_cache(unsigned int cpu)
{
	struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *vm_stack = cached_vm_stacks[i];

		if (!vm_stack)
			continue;

		vfree(vm_stack->addr);
		cached_vm_stacks[i] = NULL;
	}

	return 0;
}
#endif

static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
#ifdef CONFIG_VMAP_STACK
	void *stack;
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *s;

		s = this_cpu_xchg(cached_stacks[i], NULL);

		if (!s)
			continue;

		/* Mark stack accessible for KASAN. */
		kasan_unpoison_range(s->addr, THREAD_SIZE);

		/* Clear stale pointers from reused stack. */
		memset(s->addr, 0, THREAD_SIZE);

		tsk->stack_vm_area = s;
		tsk->stack = s->addr;
		return s->addr;
	}

	/*
	 * Allocated stacks are cached and later reused by new threads,
	 * so memcg accounting is performed manually on assigning/releasing
	 * stacks to tasks.
	 */
	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
				     VMALLOC_START, VMALLOC_END,
				     THREADINFO_GFP & ~__GFP_ACCOUNT,
				     PAGE_KERNEL,
				     0, node, __builtin_return_address(0));

	/*
	 * We can't call find_vm_area() in interrupt context, and
	 * free_thread_stack() can be called in interrupt context,
	 * so cache the vm_struct.
	 */
	if (stack) {
		tsk->stack_vm_area = find_vm_area(stack);
		tsk->stack = stack;
	}
	return stack;
#else
	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
					     THREAD_SIZE_ORDER);

	if (likely(page)) {
		tsk->stack = kasan_reset_tag(page_address(page));
		return tsk->stack;
	}
	return NULL;
#endif
}

static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
	struct vm_struct *vm = task_stack_vm_area(tsk);

	if (vm) {
		int i;

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
			memcg_kmem_uncharge_page(vm->pages[i], 0);

		for (i = 0; i < NR_CACHED_STACKS; i++) {
			if (this_cpu_cmpxchg(cached_stacks[i],
					NULL, tsk->stack_vm_area) != NULL)
				continue;

			return;
		}

		vfree_atomic(tsk->stack);
		return;
	}
#endif

	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_stack_cache;

static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
						  int node)
{
	unsigned long *stack;
	stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
	stack = kasan_reset_tag(stack);
	tsk->stack = stack;
	return stack;
}

static void free_thread_stack(struct task_struct *tsk)
{
	kmem_cache_free(thread_stack_cache, tsk->stack);
}

void thread_stack_cache_init(void)
{
	thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
					THREAD_SIZE, THREAD_SIZE, 0, 0,
					THREAD_SIZE, NULL);
	BUG_ON(thread_stack_cache == NULL);
}
# endif
#endif

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (mm) */
static struct kmem_cache *mm_cachep;

struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
	if (vma)
		vma_init(vma, mm);
	return vma;
}

struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);

	if (new) {
		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
		/*
		 * orig->shared.rb may be modified concurrently, but the clone
		 * will be reinitialized.
		 */
		*new = data_race(*orig);
		INIT_LIST_HEAD(&new->anon_vma_chain);
		new->vm_next = new->vm_prev = NULL;
	}
	return new;
}

void vm_area_free(struct vm_area_struct *vma)
{
	kmem_cache_free(vm_area_cachep, vma);
}

static void account_kernel_stack(struct task_struct *tsk, int account)
{
	void *stack = task_stack_page(tsk);
	struct vm_struct *vm = task_stack_vm_area(tsk);

	if (vm) {
		int i;

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
			mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
					      account * (PAGE_SIZE / 1024));
	} else {
		/* All stack pages are in the same node. */
		mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
				      account * (THREAD_SIZE / 1024));
	}
}

static int memcg_charge_kernel_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
	struct vm_struct *vm = task_stack_vm_area(tsk);
	int ret;

	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);

	if (vm) {
		int i;

		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
			/*
			 * If memcg_kmem_charge_page() fails, page's
			 * memory cgroup pointer is NULL, and
			 * memcg_kmem_uncharge_page() in free_thread_stack()
			 * will ignore this page.
			 */
			ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
						     0);
			if (ret)
				return ret;
		}
	}
#endif
	return 0;
}

static void release_task_stack(struct task_struct *tsk)
{
	if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
		return;  /* Better to leak the stack than to free prematurely */

	account_kernel_stack(tsk, -1);
	free_thread_stack(tsk);
	tsk->stack = NULL;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = NULL;
#endif
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
void put_task_stack(struct task_struct *tsk)
{
	if (refcount_dec_and_test(&tsk->stack_refcount))
		release_task_stack(tsk);
}
#endif

void free_task(struct task_struct *tsk)
{
	scs_release(tsk);

#ifndef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * The task is finally done with both the stack and thread_info,
	 * so free both.
	 */
	release_task_stack(tsk);
#else
	/*
	 * If the task had a separate stack allocation, it should be gone
	 * by now.
	 */
	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
	rt_mutex_debug_task_free(tsk);
	ftrace_graph_exit_task(tsk);
	arch_release_task_struct(tsk);
	if (tsk->flags & PF_KTHREAD)
		free_kthread_struct(tsk);
	free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

#ifdef CONFIG_MMU
static __latent_entropy int dup_mmap(struct mm_struct *mm,
					struct mm_struct *oldmm)
{
	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;
	LIST_HEAD(uf);

	uprobe_start_dup_mmap();
	if (mmap_write_lock_killable(oldmm)) {
		retval = -EINTR;
		goto fail_uprobe_end;
	}
	flush_cache_dup_mm(oldmm);
	uprobe_dup_mmap(oldmm, mm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);

	/* No ordering required: file already has been exposed. */
	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));

	mm->total_vm = oldmm->total_vm;
	mm->data_vm = oldmm->data_vm;
	mm->exec_vm = oldmm->exec_vm;
	mm->stack_vm = oldmm->stack_vm;

	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;
	retval = ksm_fork(mm, oldmm);
	if (retval)
		goto out;
	retval = khugepaged_fork(mm, oldmm);
	if (retval)
		goto out;

	prev = NULL;
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {
			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
			continue;
		}
		charge = 0;
		/*
		 * Don't duplicate many vmas if we've been oom-killed (for
		 * example)
		 */
		if (fatal_signal_pending(current)) {
			retval = -EINTR;
			goto out;
		}
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned long len = vma_pages(mpnt);

			if (security_vm_enough_memory_mm(oldmm, len))
				goto fail_nomem;
			charge = len;
		}
		tmp = vm_area_dup(mpnt);
		if (!tmp)
			goto fail_nomem;
		retval = vma_dup_policy(mpnt, tmp);
		if (retval)
			goto fail_nomem_policy;
		tmp->vm_mm = mm;
		retval = dup_userfaultfd(tmp, &uf);
		if (retval)
			goto fail_nomem_anon_vma_fork;
		if (tmp->vm_flags & VM_WIPEONFORK) {
			/*
			 * VM_WIPEONFORK gets a clean slate in the child.
			 * Don't prepare anon_vma until fault since we don't
			 * copy page for current vma.
			 */
			tmp->anon_vma = NULL;
		} else if (anon_vma_fork(tmp, mpnt))
			goto fail_nomem_anon_vma_fork;
		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
		file = tmp->vm_file;
		if (file) {
			struct inode *inode = file_inode(file);
			struct address_space *mapping = file->f_mapping;

			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				put_write_access(inode);
			i_mmap_lock_write(mapping);
			if (tmp->vm_flags & VM_SHARED)
				mapping_allow_writable(mapping);
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			vma_interval_tree_insert_after(tmp, mpnt,
					&mapping->i_mmap);
			flush_dcache_mmap_unlock(mapping);
			i_mmap_unlock_write(mapping);
		}

		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

		/*
		 * Link in the new vma and copy the page table entries.
		 */
		*pprev = tmp;
		pprev = &tmp->vm_next;
		tmp->vm_prev = prev;
		prev = tmp;

		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
		if (!(tmp->vm_flags & VM_WIPEONFORK))
			retval = copy_page_range(tmp, mpnt);

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
	/* a new mm has just been created */
	retval = arch_dup_mmap(oldmm, mm);
out:
	mmap_write_unlock(mm);
	flush_tlb_mm(oldmm);
	mmap_write_unlock(oldmm);
	dup_userfaultfd_complete(&uf);
fail_uprobe_end:
	uprobe_end_dup_mmap();
	return retval;
fail_nomem_anon_vma_fork:
	mpol_put(vma_policy(tmp));
fail_nomem_policy:
	vm_area_free(tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}

static inline int mm_alloc_pgd(struct mm_struct *mm)
{
	mm->pgd = pgd_alloc(mm);
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}

static inline void mm_free_pgd(struct mm_struct *mm)
{
	pgd_free(mm, mm->pgd);
}
#else
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	mmap_write_lock(oldmm);
	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
	mmap_write_unlock(oldmm);
	return 0;
}
#define mm_alloc_pgd(mm)	(0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

static void check_mm(struct mm_struct *mm)
{
	int i;

	BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
			 "Please make sure 'struct resident_page_types[]' is updated as well");

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		long x = atomic_long_read(&mm->rss_stat.count[i]);

		if (unlikely(x))
			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
				 mm, resident_page_types[i], x);
	}

	if (mm_pgtables_bytes(mm))
		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
				mm_pgtables_bytes(mm));

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
void __mmdrop(struct mm_struct *mm)
{
	BUG_ON(mm == &init_mm);
	WARN_ON_ONCE(mm == current->mm);
	WARN_ON_ONCE(mm == current->active_mm);
	mm_free_pgd(mm);
	destroy_context(mm);
	mmu_notifier_subscriptions_destroy(mm);
	check_mm(mm);
	put_user_ns(mm->user_ns);
	free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);

static void mmdrop_async_fn(struct work_struct *work)
{
	struct mm_struct *mm;

	mm = container_of(work, struct mm_struct, async_put_work);
	__mmdrop(mm);
}

static void mmdrop_async(struct mm_struct *mm)
{
	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
		schedule_work(&mm->async_put_work);
	}
}

static inline void free_signal_struct(struct signal_struct *sig)
{
	taskstats_tgid_free(sig);
	sched_autogroup_exit(sig);
	/*
	 * __mmdrop is not safe to call from softirq context on x86 due to
	 * pgd_dtor so postpone it to the async context
	 */
	if (sig->oom_mm)
		mmdrop_async(sig->oom_mm);
	kmem_cache_free(signal_cachep, sig);
}

static inline void put_signal_struct(struct signal_struct *sig)
{
	if (refcount_dec_and_test(&sig->sigcnt))
		free_signal_struct(sig);
}

void __put_task_struct(struct task_struct *tsk)
{
	WARN_ON(!tsk->exit_state);
	WARN_ON(refcount_read(&tsk->usage));
	WARN_ON(tsk == current);

	io_uring_free(tsk);
	cgroup_free(tsk);
	task_numa_free(tsk, true);
	security_task_free(tsk);
	bpf_task_storage_free(tsk);
	exit_creds(tsk);
	delayacct_tsk_free(tsk);
	put_signal_struct(tsk->signal);
	sched_core_free(tsk);

	if (!profile_handoff_task(tsk))
		free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);
void __init __weak arch_task_cache_init(void) { }

/*
 * set_max_threads
 */
static void set_max_threads(unsigned int max_threads_suggested)
{
	u64 threads;
	unsigned long nr_pages = totalram_pages();

	/*
	 * The number of threads shall be limited such that the thread
	 * structures may only consume a small part of the available memory.
	 */
	if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
		threads = MAX_THREADS;
	else
		threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
				    (u64) THREAD_SIZE * 8UL);

	if (threads > max_threads_suggested)
		threads = max_threads_suggested;

	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}
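
/*
 * Worked example (illustrative, not part of the original source): with
 * 16 GiB of RAM (nr_pages * PAGE_SIZE = 2^34), 4 KiB pages and a 16 KiB
 * THREAD_SIZE, the division above yields
 *
 *	threads = 2^34 / (2^14 * 8) = 2^17 = 131072
 *
 * i.e. thread structures may consume at most one eighth of memory before
 * the result is clamped to [MIN_THREADS, MAX_THREADS].
 */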

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
	/* Fetch thread_struct whitelist for the architecture. */
	arch_thread_struct_whitelist(offset, size);

	/*
	 * Handle zero-sized whitelist or empty thread_struct, otherwise
	 * adjust offset to position of thread_struct in task_struct.
	 */
	if (unlikely(*size == 0))
		*offset = 0;
	else
		*offset += offsetof(struct task_struct, thread);
}
#endif

void __init fork_init(void)
{
	int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN	0
#endif
	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
	unsigned long useroffset, usersize;

	/* create a slab on which task_structs can be allocated */
	task_struct_whitelist(&useroffset, &usersize);
	task_struct_cachep = kmem_cache_create_usercopy("task_struct",
			arch_task_struct_size, align,
			SLAB_PANIC|SLAB_ACCOUNT,
			useroffset, usersize, NULL);
#endif

	/* do the arch specific task caches init */
	arch_task_cache_init();

	set_max_threads(MAX_THREADS);

	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
	init_task.signal->rlim[RLIMIT_SIGPENDING] =
		init_task.signal->rlim[RLIMIT_NPROC];

	for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
		init_user_ns.ucount_max[i] = max_threads/2;

	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);

#ifdef CONFIG_VMAP_STACK
	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
			  NULL, free_vm_stack_cache);
#endif

	scs_init();

	lockdep_init_task(&init_task);
	uprobes_init();
}

int __weak arch_dup_task_struct(struct task_struct *dst,
					       struct task_struct *src)
{
	*dst = *src;
	return 0;
}

void set_task_stack_end_magic(struct task_struct *tsk)
{
	unsigned long *stackend;

	stackend = end_of_stack(tsk);
	*stackend = STACK_END_MAGIC;	/* for overflow detection */
}

static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
	struct task_struct *tsk;
	unsigned long *stack;
	struct vm_struct *stack_vm_area __maybe_unused;
	int err;

	if (node == NUMA_NO_NODE)
		node = tsk_fork_get_node(orig);
	tsk = alloc_task_struct_node(node);
	if (!tsk)
		return NULL;

	stack = alloc_thread_stack_node(tsk, node);
	if (!stack)
		goto free_tsk;

	if (memcg_charge_kernel_stack(tsk))
		goto free_stack;

	stack_vm_area = task_stack_vm_area(tsk);

	err = arch_dup_task_struct(tsk, orig);

	/*
	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
	 * sure they're properly initialized before using any stack-related
	 * functions again.
	 */
	tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
	refcount_set(&tsk->stack_refcount, 1);
#endif

	if (err)
		goto free_stack;

	err = scs_prepare(tsk, node);
	if (err)
		goto free_stack;

#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
	 * the sighand lock in case orig has changed between now and
	 * then. Until then, filter must be NULL to avoid messing up
	 * the usage counts on the error path calling free_task.
	 */
	tsk->seccomp.filter = NULL;
#endif

	setup_thread_stack(tsk, orig);
	clear_user_return_notifier(tsk);
	clear_tsk_need_resched(tsk);
	set_task_stack_end_magic(tsk);
	clear_syscall_work_syscall_user_dispatch(tsk);

#ifdef CONFIG_STACKPROTECTOR
	tsk->stack_canary = get_random_canary();
#endif
	if (orig->cpus_ptr == &orig->cpus_mask)
		tsk->cpus_ptr = &tsk->cpus_mask;

	/*
	 * One for the user space visible state that goes away when reaped.
	 * One for the scheduler.
	 */
	refcount_set(&tsk->rcu_users, 2);
	/* One for the rcu users */
	refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
	tsk->btrace_seq = 0;
#endif
	tsk->splice_pipe = NULL;
	tsk->task_frag.page = NULL;
	tsk->wake_q.next = NULL;
	tsk->pf_io_worker = NULL;

	account_kernel_stack(tsk, 1);

	kcov_task_init(tsk);
	kmap_local_fork(tsk);

#ifdef CONFIG_FAULT_INJECTION
	tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
	tsk->throttle_queue = NULL;
	tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_MEMCG
	tsk->active_memcg = NULL;
#endif
	return tsk;

free_stack:
	free_thread_stack(tsk);
free_tsk:
	free_task_struct(tsk);
	return NULL;
}

__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

static int __init coredump_filter_setup(char *s)
{
	default_dump_filter =
		(simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
		MMF_DUMP_FILTER_MASK;
	return 1;
}

__setup("coredump_filter=", coredump_filter_setup);
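
/*
 * Usage sketch (illustrative): booting with coredump_filter=0x33 makes every
 * mm start out with MMF_DUMP_ANON_PRIVATE, MMF_DUMP_ANON_SHARED,
 * MMF_DUMP_ELF_HEADERS and MMF_DUMP_HUGETLB_PRIVATE set, exactly as if 0x33
 * had been written to /proc/<pid>/coredump_filter for every process.
 */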

#include <linux/init_task.h>

static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
	spin_lock_init(&mm->ioctx_lock);
	mm->ioctx_table = NULL;
#endif
}

static __always_inline void mm_clear_owner(struct mm_struct *mm,
					   struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	if (mm->owner == p)
		WRITE_ONCE(mm->owner, NULL);
#endif
}

static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	mm->owner = p;
#endif
}

static void mm_init_pasid(struct mm_struct *mm)
{
#ifdef CONFIG_IOMMU_SUPPORT
	mm->pasid = INIT_PASID;
#endif
}

static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
	mm->uprobes_state.xol_area = NULL;
#endif
}

static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
	struct user_namespace *user_ns)
{
	mm->mmap = NULL;
	mm->mm_rb = RB_ROOT;
	mm->vmacache_seqnum = 0;
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	seqcount_init(&mm->write_protect_seq);
	mmap_init_lock(mm);
	INIT_LIST_HEAD(&mm->mmlist);
	mm->core_state = NULL;
	mm_pgtables_bytes_init(mm);
	mm->map_count = 0;
	mm->locked_vm = 0;
	atomic64_set(&mm->pinned_vm, 0);
	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
	spin_lock_init(&mm->page_table_lock);
	spin_lock_init(&mm->arg_lock);
	mm_init_cpumask(mm);
	mm_init_aio(mm);
	mm_init_owner(mm, p);
	mm_init_pasid(mm);
	RCU_INIT_POINTER(mm->exe_file, NULL);
	mmu_notifier_subscriptions_init(mm);
	init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	mm->pmd_huge_pte = NULL;
#endif
	mm_init_uprobes_state(mm);

	if (current->mm) {
		mm->flags = current->mm->flags & MMF_INIT_MASK;
		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
	} else {
		mm->flags = default_dump_filter;
		mm->def_flags = 0;
	}

	if (mm_alloc_pgd(mm))
		goto fail_nopgd;

	if (init_new_context(p, mm))
		goto fail_nocontext;

	mm->user_ns = get_user_ns(user_ns);
	return mm;

fail_nocontext:
	mm_free_pgd(mm);
fail_nopgd:
	free_mm(mm);
	return NULL;
}

/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct *mm_alloc(void)
{
	struct mm_struct *mm;

	mm = allocate_mm();
	if (!mm)
		return NULL;

	memset(mm, 0, sizeof(*mm));
	return mm_init(mm, current, current_user_ns());
}

static inline void __mmput(struct mm_struct *mm)
{
	VM_BUG_ON(atomic_read(&mm->mm_users));

	uprobe_clear_state(mm);
	exit_aio(mm);
	ksm_exit(mm);
	khugepaged_exit(mm); /* must run before exit_mmap */
	exit_mmap(mm);
	mm_put_huge_zero_page(mm);
	set_mm_exe_file(mm, NULL);
	if (!list_empty(&mm->mmlist)) {
		spin_lock(&mmlist_lock);
		list_del(&mm->mmlist);
		spin_unlock(&mmlist_lock);
	}
	if (mm->binfmt)
		module_put(mm->binfmt->module);
	mmdrop(mm);
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
	might_sleep();

	if (atomic_dec_and_test(&mm->mm_users))
		__mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);

#ifdef CONFIG_MMU
static void mmput_async_fn(struct work_struct *work)
{
	struct mm_struct *mm = container_of(work, struct mm_struct,
					    async_put_work);

	__mmput(mm);
}

void mmput_async(struct mm_struct *mm)
{
	if (atomic_dec_and_test(&mm->mm_users)) {
		INIT_WORK(&mm->async_put_work, mmput_async_fn);
		schedule_work(&mm->async_put_work);
	}
}
#endif

/**
 * set_mm_exe_file - change a reference to the mm's executable file
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody alive left, in execve task is single
 * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
 * mm->exe_file, but does so without using set_mm_exe_file() in order
 * to avoid the need for any locks.
 */
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
	struct file *old_exe_file;

	/*
	 * It is safe to dereference the exe_file without RCU as
	 * this function is only called if nobody else can access
	 * this mm -- see comment above for justification.
	 */
	old_exe_file = rcu_dereference_raw(mm->exe_file);

	if (new_exe_file)
		get_file(new_exe_file);
	rcu_assign_pointer(mm->exe_file, new_exe_file);
	if (old_exe_file)
		fput(old_exe_file);
}

/**
 * get_mm_exe_file - acquire a reference to the mm's executable file
 *
 * Returns %NULL if mm has no associated executable file.
 * User must release file via fput().
 */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
	struct file *exe_file;

	rcu_read_lock();
	exe_file = rcu_dereference(mm->exe_file);
	if (exe_file && !get_file_rcu(exe_file))
		exe_file = NULL;
	rcu_read_unlock();
	return exe_file;
}
EXPORT_SYMBOL(get_mm_exe_file);

/**
 * get_task_exe_file - acquire a reference to the task's executable file
 *
 * Returns %NULL if task's mm (if any) has no associated executable file or
 * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
 * User must release file via fput().
 */
struct file *get_task_exe_file(struct task_struct *task)
{
	struct file *exe_file = NULL;
	struct mm_struct *mm;

	task_lock(task);
	mm = task->mm;
	if (mm) {
		if (!(task->flags & PF_KTHREAD))
			exe_file = get_mm_exe_file(mm);
	}
	task_unlock(task);
	return exe_file;
}
EXPORT_SYMBOL(get_task_exe_file);

/**
 * get_task_mm - acquire a reference to the task's mm
 *
 * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
 * this kernel workthread has transiently adopted a user mm with use_mm,
 * to do its AIO) is not set and if so returns a reference to it, after
 * bumping up the use count.  User must release the mm via mmput()
 * after use.  Typically used by /proc and ptrace.
 */
struct mm_struct *get_task_mm(struct task_struct *task)
{
	struct mm_struct *mm;

	task_lock(task);
	mm = task->mm;
	if (mm) {
		if (task->flags & PF_KTHREAD)
			mm = NULL;
		else
			mmget(mm);
	}
	task_unlock(task);
	return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);
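
/*
 * Typical calling pattern (illustrative sketch): every successful
 * get_task_mm() must be paired with mmput():
 *
 *	struct mm_struct *mm = get_task_mm(task);
 *	if (mm) {
 *		...use mm under a real reference...
 *		mmput(mm);
 *	}
 */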

struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
	struct mm_struct *mm;
	int err;

	err = down_read_killable(&task->signal->exec_update_lock);
	if (err)
		return ERR_PTR(err);

	mm = get_task_mm(task);
	if (mm && mm != current->mm &&
			!ptrace_may_access(task, mode)) {
		mmput(mm);
		mm = ERR_PTR(-EACCES);
	}
	up_read(&task->signal->exec_update_lock);

	return mm;
}

static void complete_vfork_done(struct task_struct *tsk)
{
	struct completion *vfork;

	task_lock(tsk);
	vfork = tsk->vfork_done;
	if (likely(vfork)) {
		tsk->vfork_done = NULL;
		complete(vfork);
	}
	task_unlock(tsk);
}

static int wait_for_vfork_done(struct task_struct *child,
				struct completion *vfork)
{
	int killed;

	freezer_do_not_count();
	cgroup_enter_frozen();
	killed = wait_for_completion_killable(vfork);
	cgroup_leave_frozen(false);
	freezer_count();

	if (killed) {
		task_lock(child);
		child->vfork_done = NULL;
		task_unlock(child);
	}

	put_task_struct(child);
	return killed;
}

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	uprobe_free_utask(tsk);

	/* Get rid of any cached register state */
	deactivate_mm(tsk, mm);

	/*
	 * Signal userspace if we're not exiting with a core dump
	 * because we want to leave the value intact for debugging
	 * purposes.
	 */
	if (tsk->clear_child_tid) {
		if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
		    atomic_read(&mm->mm_users) > 1) {
			/*
			 * We don't check the error code - if userspace has
			 * not set up a proper pointer then tough luck.
			 */
			put_user(0, tsk->clear_child_tid);
			do_futex(tsk->clear_child_tid, FUTEX_WAKE,
					1, NULL, NULL, 0, 0);
		}
		tsk->clear_child_tid = NULL;
	}

	/*
	 * All done, finally we can wake up parent and return this mm to him.
	 * Also kthread_stop() uses this completion for synchronization.
	 */
	if (tsk->vfork_done)
		complete_vfork_done(tsk);
}

void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	futex_exit_release(tsk);
	mm_release(tsk, mm);
}

void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	futex_exec_release(tsk);
	mm_release(tsk, mm);
}

/**
 * dup_mm() - duplicates an existing mm structure
 * @tsk: the task_struct with which the new mm will be associated.
 * @oldmm: the mm to duplicate.
 *
 * Allocates a new mm structure and duplicates the provided @oldmm structure
 * content into it.
 *
 * Return: the duplicated mm or NULL on failure.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk,
				struct mm_struct *oldmm)
{
	struct mm_struct *mm;
	int err;

	mm = allocate_mm();
	if (!mm)
		goto fail_nomem;

	memcpy(mm, oldmm, sizeof(*mm));

	if (!mm_init(mm, tsk, mm->user_ns))
		goto fail_nomem;

	err = dup_mmap(mm, oldmm);
	if (err)
		goto free_pt;

	mm->hiwater_rss = get_mm_rss(mm);
	mm->hiwater_vm = mm->total_vm;

	if (mm->binfmt && !try_module_get(mm->binfmt->module))
		goto free_pt;

	return mm;

free_pt:
	/* don't put binfmt in mmput, we haven't got module yet */
	mm->binfmt = NULL;
	mm_init_owner(mm, NULL);
	mmput(mm);

fail_nomem:
	return NULL;
}

static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
	tsk->last_switch_time = 0;
#endif

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal a active VM for that..
	 */
	oldmm = current->mm;
	if (!oldmm)
		return 0;

	/* initialize the new vmacache entries */
	vmacache_flush(tsk);

	if (clone_flags & CLONE_VM) {
		mmget(oldmm);
		mm = oldmm;
	} else {
		mm = dup_mm(tsk, current->mm);
		if (!mm)
			return -ENOMEM;
	}

	tsk->mm = mm;
	tsk->active_mm = mm;
	return 0;
}

static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
	struct fs_struct *fs = current->fs;
	if (clone_flags & CLONE_FS) {
		/* tsk->fs is already what we want */
		spin_lock(&fs->lock);
		if (fs->in_exec) {
			spin_unlock(&fs->lock);
			return -EAGAIN;
		}
		fs->users++;
		spin_unlock(&fs->lock);
		return 0;
	}
	tsk->fs = copy_fs_struct(fs);
	if (!tsk->fs)
		return -ENOMEM;
	return 0;
}

static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
{
	struct files_struct *oldf, *newf;
	int error = 0;

	/*
	 * A background process may not have any files ...
	 */
	oldf = current->files;
	if (!oldf)
		goto out;

	if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);
		goto out;
	}

	newf = dup_fd(oldf, NR_OPEN_MAX, &error);
	if (!newf)
		goto out;

	tsk->files = newf;
	error = 0;
out:
	return error;
}

static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
	struct io_context *ioc = current->io_context;
	struct io_context *new_ioc;

	if (!ioc)
		return 0;
	/*
	 * Share io context with parent, if CLONE_IO is set
	 */
	if (clone_flags & CLONE_IO) {
		ioc_task_link(ioc);
		tsk->io_context = ioc;
	} else if (ioprio_valid(ioc->ioprio)) {
		new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
		if (unlikely(!new_ioc))
			return -ENOMEM;

		new_ioc->ioprio = ioc->ioprio;
		put_io_context(new_ioc);
	}
#endif
	return 0;
}

static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
	struct sighand_struct *sig;

	if (clone_flags & CLONE_SIGHAND) {
		refcount_inc(&current->sighand->count);
		return 0;
	}
	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
	RCU_INIT_POINTER(tsk->sighand, sig);
	if (!sig)
		return -ENOMEM;

	refcount_set(&sig->count, 1);
	spin_lock_irq(&current->sighand->siglock);
	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
	spin_unlock_irq(&current->sighand->siglock);

	/* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
	if (clone_flags & CLONE_CLEAR_SIGHAND)
		flush_signal_handlers(tsk, 0);

	return 0;
}

void __cleanup_sighand(struct sighand_struct *sighand)
{
	if (refcount_dec_and_test(&sighand->count)) {
		signalfd_cleanup(sighand);
		/*
		 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
		 * without an RCU grace period, see __lock_task_sighand().
		 */
		kmem_cache_free(sighand_cachep, sighand);
	}
}

/*
 * Initialize POSIX timer handling for a thread group.
 */
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
	struct posix_cputimers *pct = &sig->posix_cputimers;
	unsigned long cpu_limit;

	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
	posix_cputimers_group_init(pct, cpu_limit);
}

static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
	struct signal_struct *sig;

	if (clone_flags & CLONE_THREAD)
		return 0;

	sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
	tsk->signal = sig;
	if (!sig)
		return -ENOMEM;

	sig->nr_threads = 1;
	atomic_set(&sig->live, 1);
	refcount_set(&sig->sigcnt, 1);

	/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
	sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
	tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);

	init_waitqueue_head(&sig->wait_chldexit);
	sig->curr_target = tsk;
	init_sigpending(&sig->shared_pending);
	INIT_HLIST_HEAD(&sig->multiprocess);
	seqlock_init(&sig->stats_lock);
	prev_cputime_init(&sig->prev_cputime);

#ifdef CONFIG_POSIX_TIMERS
	INIT_LIST_HEAD(&sig->posix_timers);
	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	sig->real_timer.function = it_real_fn;
#endif

	task_lock(current->group_leader);
	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
	task_unlock(current->group_leader);

	posix_cpu_timers_init_group(sig);

	tty_audit_fork(sig);
	sched_autogroup_fork(sig);

	sig->oom_score_adj = current->signal->oom_score_adj;
	sig->oom_score_adj_min = current->signal->oom_score_adj_min;

	mutex_init(&sig->cred_guard_mutex);
	init_rwsem(&sig->exec_update_lock);

	return 0;
}

static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
	/*
	 * Must be called with sighand->lock held, which is common to
	 * all threads in the group. Holding cred_guard_mutex is not
	 * needed because this new task is not yet running and cannot
	 * be racing exec.
	 */
	assert_spin_locked(&current->sighand->siglock);

	/* Ref-count the new filter user, and assign it. */
	get_seccomp_filter(current);
	p->seccomp = current->seccomp;

	/*
	 * Explicitly enable no_new_privs here in case it got set
	 * between the task_struct being duplicated and holding the
	 * sighand lock. The seccomp state and nnp must be in sync.
	 */
	if (task_no_new_privs(current))
		task_set_no_new_privs(p);

	/*
	 * If the parent gained a seccomp mode after copying thread
	 * flags and between before we held the sighand lock, we have
	 * to manually enable the seccomp thread flag here.
	 */
	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
		set_task_syscall_work(p, SECCOMP);
#endif
}

SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
	current->clear_child_tid = tidptr;

	return task_pid_vnr(current);
}
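
/*
 * Userspace sketch (illustrative, not from this file): C libraries combine
 * set_tid_address()/CLONE_CHILD_CLEARTID with a futex wait so that
 * pthread_join() blocks until mm_release() above clears the TID word and
 * wakes the waiter:
 *
 *	pid_t tid = 1;
 *	syscall(SYS_set_tid_address, &tid);
 *	...on thread exit the kernel stores 0 to &tid and FUTEX_WAKEs it...
 */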

static void rt_mutex_init_task(struct task_struct *p)
{
	raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
	p->pi_waiters = RB_ROOT_CACHED;
	p->pi_top_task = NULL;
	p->pi_blocked_on = NULL;
#endif
}

static inline void init_task_pid_links(struct task_struct *task)
{
	enum pid_type type;

	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_NODE(&task->pid_links[type]);
}

static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
	if (type == PIDTYPE_PID)
		task->thread_pid = pid;
	else
		task->signal->pids[type] = pid;
}

static inline void rcu_copy_process(struct task_struct *p)
{
#ifdef CONFIG_PREEMPT_RCU
	p->rcu_read_lock_nesting = 0;
	p->rcu_read_unlock_special.s = 0;
	p->rcu_blocked_node = NULL;
	INIT_LIST_HEAD(&p->rcu_node_entry);
#endif
#ifdef CONFIG_TASKS_RCU
	p->rcu_tasks_holdout = false;
	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
	p->rcu_tasks_idle_cpu = -1;
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
	p->trc_reader_nesting = 0;
	p->trc_reader_special.s = 0;
	INIT_LIST_HEAD(&p->trc_holdout_list);
#endif
}

struct pid *pidfd_pid(const struct file *file)
{
	if (file->f_op == &pidfd_fops)
		return file->private_data;

	return ERR_PTR(-EBADF);
}

static int pidfd_release(struct inode *inode, struct file *file)
{
	struct pid *pid = file->private_data;

	file->private_data = NULL;
	put_pid(pid);
	return 0;
}

#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its pid. This is
 * similar to calling getppid() on a process whose parent is outside of
 * its pid namespace.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid of a given pidfd refers to for all descendant pid namespaces
 * starting from the current pid namespace of the instance, i.e. the
 * Pid field and the first entry in the NSpid field will be identical.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * entry and no others will be shown.
 * Note that this differs from the Pid and NSpid fields in
 * /proc/<pid>/status where Pid and NSpid are always shown relative to
 * the pid namespace of the procfs instance. The difference becomes
 * obvious when sending around a pidfd between pid namespaces from a
 * different branch of the tree, i.e. where no ancestral relation is
 * present between the pid namespaces:
 * - create two new pid namespaces ns1 and ns2 in the initial pid
 *   namespace (also take care to create new mount namespaces in the
 *   new pid namespace and mount procfs)
 * - create a process with a pidfd in ns1
 * - send pidfd from ns1 to ns2
 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
 *   have exactly one entry, which is 0
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct pid *pid = f->private_data;
	struct pid_namespace *ns;
	pid_t nr = -1;

	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
		ns = proc_pid_ns(file_inode(m->file)->i_sb);
		nr = pid_nr_ns(pid, ns);
	}

	seq_put_decimal_ll(m, "Pid:\t", nr);

#ifdef CONFIG_PID_NS
	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
	if (nr > 0) {
		int i;

		/* If nr is non-zero it means that 'pid' is valid and that
		 * ns, i.e. the pid namespace associated with the procfs
		 * instance, is in the pid namespace hierarchy of pid.
		 * Start at one below the already printed level.
		 */
		for (i = ns->level + 1; i <= pid->level; i++)
			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
	}
#endif
	seq_putc(m, '\n');
}
#endif

/*
 * Poll support for process exit notification.
 */
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
	struct pid *pid = file->private_data;
	__poll_t poll_flags = 0;

	poll_wait(file, &pid->wait_pidfd, pts);

	/*
	 * Inform pollers only when the whole thread group exits.
	 * If the thread group leader exits before all other threads in the
	 * group, then poll(2) should block, similar to the wait(2) family.
	 */
	if (thread_group_exited(pid))
		poll_flags = EPOLLIN | EPOLLRDNORM;

	return poll_flags;
}

const struct file_operations pidfd_fops = {
	.release = pidfd_release,
	.poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
	.show_fdinfo = pidfd_show_fdinfo,
#endif
};
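
/*
 * Userspace sketch (illustrative): waiting for a process to exit through its
 * pidfd instead of the wait(2) family:
 *
 *	int pidfd = syscall(SYS_pidfd_open, pid, 0);
 *	struct pollfd fds = { .fd = pidfd, .events = POLLIN };
 *	poll(&fds, 1, -1);	// returns once the thread group has exited
 */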

static void __delayed_free_task(struct rcu_head *rhp)
{
	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

	free_task(tsk);
}

static __always_inline void delayed_free_task(struct task_struct *tsk)
{
	if (IS_ENABLED(CONFIG_MEMCG))
		call_rcu(&tsk->rcu, __delayed_free_task);
	else
		free_task(tsk);
}

static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
{
	/* Skip if kernel thread */
	if (!tsk->mm)
		return;

	/* Skip if spawning a thread or using vfork */
	if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
		return;

	/* We need to synchronize with __set_oom_adj */
	mutex_lock(&oom_adj_mutex);
	set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
	/* Update the values in case they were changed after copy_signal */
	tsk->signal->oom_score_adj = current->signal->oom_score_adj;
	tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
	mutex_unlock(&oom_adj_mutex);
}

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static __latent_entropy struct task_struct *copy_process(
					struct pid *pid,
					int trace,
					int node,
					struct kernel_clone_args *args)
{
	int pidfd = -1, retval;
	struct task_struct *p;
	struct multiprocess_signals delayed;
	struct file *pidfile = NULL;
	u64 clone_flags = args->flags;
	struct nsproxy *nsp = current->nsproxy;

	/*
	 * Don't allow sharing the root directory with processes in a different
	 * namespace
	 */
	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	/*
	 * Siblings of global init remain as zombies on exit since they are
	 * not reaped by their parent (swapper). To solve this and to avoid
	 * multi-rooted process trees, prevent global and container-inits
	 * from creating siblings.
	 */
	if ((clone_flags & CLONE_PARENT) &&
				current->signal->flags & SIGNAL_UNKILLABLE)
		return ERR_PTR(-EINVAL);

	/*
	 * If the new process will be in a different pid or user namespace
	 * do not allow it to share a thread group with the forking task.
	 */
	if (clone_flags & CLONE_THREAD) {
		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
		    (task_active_pid_ns(current) != nsp->pid_ns_for_children))
			return ERR_PTR(-EINVAL);
	}

	/*
	 * If the new process will be in a different time namespace
	 * do not allow it to share VM or a thread group with the forking task.
	 */
	if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
		if (nsp->time_ns != nsp->time_ns_for_children)
			return ERR_PTR(-EINVAL);
	}

	if (clone_flags & CLONE_PIDFD) {
		/*
		 * - CLONE_DETACHED is blocked so that we can potentially
		 *   reuse it later for CLONE_PIDFD.
		 * - CLONE_THREAD is blocked until someone really needs it.
		 */
		if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
			return ERR_PTR(-EINVAL);
	}

	/*
	 * Force any signals received before this point to be delivered
	 * before the fork happens.  Collect up signals sent to multiple
	 * processes that happen during the fork and delay them so that
	 * they appear to happen after the fork.
	 */
	sigemptyset(&delayed.signal);
	INIT_HLIST_NODE(&delayed.node);

	spin_lock_irq(&current->sighand->siglock);
	if (!(clone_flags & CLONE_THREAD))
		hlist_add_head(&delayed.node, &current->signal->multiprocess);
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);
	retval = -ERESTARTNOINTR;
	if (task_sigpending(current))
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current, node);
	if (!p)
		goto fork_out;
	if (args->io_thread) {
		/*
		 * Mark us an IO worker, and block any signal that isn't
		 * fatal or STOP
		 */
		p->flags |= PF_IO_WORKER;
		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
	}

	/*
	 * This _must_ happen before we call free_task(), i.e. before we jump
	 * to any of the bad_fork_* labels. This is to avoid freeing
	 * p->set_child_tid which is (ab)used as a kthread's data pointer for
	 * kernel threads (see kthread_use_mm).
	 */
	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;

	ftrace_graph_init_task(p);

	rt_mutex_init_task(p);

	lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	retval = -EAGAIN;
	if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
		if (p->real_cred->user != INIT_USER &&
		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
			goto bad_fork_free;
	}
	current->flags &= ~PF_NPROC_EXCEEDED;

	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	retval = -EAGAIN;
	if (data_race(nr_threads >= max_threads))
		goto bad_fork_cleanup_count;

	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
	p->flags |= PF_FORKNOEXEC;
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	rcu_copy_process(p);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	init_sigpending(&p->pending);

	p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	p->utimescaled = p->stimescaled = 0;
#endif
	prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	seqcount_init(&p->vtime.seqcount);
	p->vtime.starttime = 0;
	p->vtime.state = VTIME_INACTIVE;
#endif

#ifdef CONFIG_IO_URING
	p->io_uring = NULL;
#endif

#if defined(SPLIT_RSS_COUNTING)
	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

	p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
	p->psi_flags = 0;
#endif

	task_io_accounting_init(&p->ioac);
	acct_clear_integrals(p);

	posix_cputimers_init(&p->posix_cputimers);

	p->io_context = NULL;
	audit_set_context(p, NULL);
	cgroup_fork(p);
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_dup(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup_threadgroup_lock;
	}
#endif
#ifdef CONFIG_CPUSETS
	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
	p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
	seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	memset(&p->irqtrace, 0, sizeof(p->irqtrace));
	p->irqtrace.hardirq_disable_ip	= _THIS_IP_;
	p->irqtrace.softirq_enable_ip	= _THIS_IP_;
	p->softirqs_enabled		= 1;
	p->softirq_context		= 0;
#endif

	p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
	lockdep_init_task(p);
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
	p->sequential_io	= 0;
	p->sequential_io_avg	= 0;
#endif
#ifdef CONFIG_BPF_SYSCALL
	RCU_INIT_POINTER(p->bpf_storage, NULL);
#endif

	/* Perform scheduler related setup. Assign this task to a CPU. */
	retval = sched_fork(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_policy;

	retval = perf_event_init_task(p, clone_flags);
	if (retval)
		goto bad_fork_cleanup_policy;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_perf;
	/* copy all the process information */
	shm_init_task(p);
	retval = security_task_alloc(p, clone_flags);
	if (retval)
		goto bad_fork_cleanup_audit;
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_security;
	retval = copy_files(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_semundo;
	retval = copy_fs(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_files;
	retval = copy_sighand(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_fs;
	retval = copy_signal(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_sighand;
	retval = copy_mm(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_signal;
	retval = copy_namespaces(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_mm;
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
	if (retval)
		goto bad_fork_cleanup_io;

	stackleak_task_init(p);

	if (pid != &init_struct_pid) {
		pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
				args->set_tid_size);
		if (IS_ERR(pid)) {
			retval = PTR_ERR(pid);
			goto bad_fork_cleanup_thread;
		}
	}

	/*
	 * This has to happen after we've potentially unshared the file
	 * descriptor table (so that the pidfd doesn't leak).
	 */
	if (clone_flags & CLONE_PIDFD) {
		retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
		if (retval < 0)
			goto bad_fork_free_pid;

		pidfd = retval;

		pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
					      O_RDWR | O_CLOEXEC);
		if (IS_ERR(pidfile)) {
			put_unused_fd(pidfd);
			retval = PTR_ERR(pidfile);
			goto bad_fork_free_pid;
		}
		get_pid(pid);	/* held by pidfile now */

		retval = put_user(pidfd, args->pidfd);
		if (retval)
			goto bad_fork_put_pidfd;
	}

#ifdef CONFIG_BLOCK
	p->plug = NULL;
#endif
	futex_init_task(p);

	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */
	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
		sas_ss_reset(p);

	/*
	 * Syscall tracing and stepping should be turned off in the
	 * child regardless of CLONE_PTRACE.
	 */
	user_disable_single_step(p);
	clear_task_syscall_work(p, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
	clear_task_syscall_work(p, SYSCALL_EMU);
#endif
	clear_tsk_latency_tracing(p);

	/* ok, now we should be set up.. */
	p->pid = pid_nr(pid);
	if (clone_flags & CLONE_THREAD) {
		p->group_leader = current->group_leader;
		p->tgid = current->tgid;
	} else {
		p->group_leader = p;
		p->tgid = p->pid;
	}

	p->nr_dirtied = 0;
	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
	p->dirty_paused_when = 0;

	p->pdeath_signal = 0;
	INIT_LIST_HEAD(&p->thread_group);
	p->task_works = NULL;

#ifdef CONFIG_KRETPROBES
	p->kretprobe_instances.first = NULL;
#endif

	/*
	 * Ensure that the cgroup subsystem policies allow the new process to be
	 * forked. It should be noted that the new process's css_set can be changed
	 * between here and cgroup_post_fork() if an organisation operation is in
	 * progress.
	 */
	retval = cgroup_can_fork(p, args);
	if (retval)
		goto bad_fork_put_pidfd;

	/*
	 * From this point on we must avoid any synchronous user-space
	 * communication until we take the tasklist-lock. In particular, we do
	 * not want user-space to be able to predict the process start-time by
	 * stalling fork(2) after we recorded the start_time but before it is
	 * visible to the system.
	 */

	p->start_time = ktime_get_ns();
	p->start_boottime = ktime_get_boottime_ns();

	/*
	 * Make it visible to the rest of the system, but dont wake it up yet.
	 * Need tasklist lock for parent etc handling!
	 */
	write_lock_irq(&tasklist_lock);

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
		p->real_parent = current->real_parent;
		p->parent_exec_id = current->parent_exec_id;
		if (clone_flags & CLONE_THREAD)
			p->exit_signal = -1;
		else
			p->exit_signal = current->group_leader->exit_signal;
	} else {
		p->real_parent = current;
		p->parent_exec_id = current->self_exec_id;
		p->exit_signal = args->exit_signal;
	}

	klp_copy_process(p);

	sched_core_fork(p);

	spin_lock(&current->sighand->siglock);

	/*
	 * Copy seccomp details explicitly here, in case they were changed
	 * before holding sighand lock.
	 */
	copy_seccomp(p);

	rseq_fork(p, clone_flags);

	/* Don't start children in a dying pid namespace */
	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
		retval = -ENOMEM;
		goto bad_fork_cancel_cgroup;
	}

	/* Let kill terminate clone/fork in the middle */
	if (fatal_signal_pending(current)) {
		retval = -EINTR;
		goto bad_fork_cancel_cgroup;
	}

	/* past the last point of failure */
	if (pidfile)
		fd_install(pidfd, pidfile);

	init_task_pid_links(p);
	if (likely(p->pid)) {
		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

		init_task_pid(p, PIDTYPE_PID, pid);
		if (thread_group_leader(p)) {
			init_task_pid(p, PIDTYPE_TGID, pid);
			init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
			init_task_pid(p, PIDTYPE_SID, task_session(current));

			if (is_child_reaper(pid)) {
				ns_of_pid(pid)->child_reaper = p;
				p->signal->flags |= SIGNAL_UNKILLABLE;
			}
			p->signal->shared_pending.signal = delayed.signal;
			p->signal->tty = tty_kref_get(current->signal->tty);
			/*
			 * Inherit has_child_subreaper flag under the same
			 * tasklist_lock with adding child to the process tree
			 * for propagate_has_child_subreaper optimization.
			 */
			p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
							 p->real_parent->signal->is_child_subreaper;
			list_add_tail(&p->sibling, &p->real_parent->children);
			list_add_tail_rcu(&p->tasks, &init_task.tasks);
			attach_pid(p, PIDTYPE_TGID);
			attach_pid(p, PIDTYPE_PGID);
			attach_pid(p, PIDTYPE_SID);
			__this_cpu_inc(process_counts);
		} else {
			current->signal->nr_threads++;
			atomic_inc(&current->signal->live);
			refcount_inc(&current->signal->sigcnt);
			task_join_group_stop(p);
			list_add_tail_rcu(&p->thread_group,
					  &p->group_leader->thread_group);
			list_add_tail_rcu(&p->thread_node,
					  &p->signal->thread_head);
		}
		attach_pid(p, PIDTYPE_PID);
		nr_threads++;
	}
	total_forks++;
	hlist_del_init(&delayed.node);
	spin_unlock(&current->sighand->siglock);
	syscall_tracepoint_update(p);
	write_unlock_irq(&tasklist_lock);

	proc_fork_connector(p);
	sched_post_fork(p);
	cgroup_post_fork(p, args);
	perf_event_fork(p);

	trace_task_newtask(p, clone_flags);
	uprobe_copy_process(p, clone_flags);

	copy_oom_score_adj(clone_flags, p);

	return p;

bad_fork_cancel_cgroup:
	sched_core_free(p);
	spin_unlock(&current->sighand->siglock);
	write_unlock_irq(&tasklist_lock);
	cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
	if (clone_flags & CLONE_PIDFD) {
		fput(pidfile);
		put_unused_fd(pidfd);
	}
bad_fork_free_pid:
	if (pid != &init_struct_pid)
		free_pid(pid);
bad_fork_cleanup_thread:
	exit_thread(p);
bad_fork_cleanup_io:
	if (p->io_context)
		exit_io_context(p);
bad_fork_cleanup_namespaces:
	exit_task_namespaces(p);
bad_fork_cleanup_mm:
	if (p->mm) {
		mm_clear_owner(p->mm, p);
		mmput(p->mm);
	}
bad_fork_cleanup_signal:
	if (!(clone_flags & CLONE_THREAD))
		free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_security:
	security_task_free(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_perf:
	perf_event_free_task(p);
bad_fork_cleanup_policy:
	lockdep_free_task(p);
#ifdef CONFIG_NUMA
	mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
	delayacct_tsk_free(p);
bad_fork_cleanup_count:
	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
	exit_creds(p);
bad_fork_free:
	WRITE_ONCE(p->__state, TASK_DEAD);
	put_task_stack(p);
	delayed_free_task(p);
fork_out:
	spin_lock_irq(&current->sighand->siglock);
	hlist_del_init(&delayed.node);
	spin_unlock_irq(&current->sighand->siglock);
	return ERR_PTR(retval);
}

static inline void init_idle_pids(struct task_struct *idle)
{
	enum pid_type type;

	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
		INIT_HLIST_NODE(&idle->pid_links[type]);
		init_task_pid(idle, type, &init_struct_pid);
	}
}

struct task_struct * __init fork_idle(int cpu)
{
	struct task_struct *task;
	struct kernel_clone_args args = {
		.flags = CLONE_VM,
	};

	task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
	if (!IS_ERR(task)) {
		init_idle_pids(task);
		init_idle(task, cpu);
	}

	return task;
}

struct mm_struct *copy_init_mm(void)
{
	return dup_mm(NULL, &init_mm);
}

/*
 * This is like kernel_clone(), but shaved down and tailored to just
 * creating io_uring workers. It returns a created task, or an error pointer.
 * The returned task is inactive, and the caller must fire it up through
 * wake_up_new_task(p). All signals are blocked in the created task.
 */
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
	unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
				CLONE_IO;
	struct kernel_clone_args args = {
		.flags		= ((lower_32_bits(flags) | CLONE_VM |
				    CLONE_UNTRACED) & ~CSIGNAL),
		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
		.stack		= (unsigned long)fn,
		.stack_size	= (unsigned long)arg,
		.io_thread	= 1,
	};

	return copy_process(NULL, 0, node, &args);
}

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
	u64 clone_flags = args->flags;
	struct completion vfork;
	struct pid *pid;
	struct task_struct *p;
	int trace = 0;
	pid_t nr;

	/*
	 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
	 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
	 * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
	 * field in struct clone_args and it still doesn't make sense to have
	 * them both point at the same memory location. Performing this check
	 * here has the advantage that we don't need to have a separate helper
	 * to check for legacy clone().
	 */
	if ((args->flags & CLONE_PIDFD) &&
	    (args->flags & CLONE_PARENT_SETTID) &&
	    (args->pidfd == args->parent_tid))
		return -EINVAL;

	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if (args->exit_signal != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
	add_latent_entropy();

	if (IS_ERR(p))
		return PTR_ERR(p);

	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	trace_sched_process_fork(current, p);

	pid = get_task_pid(p, PIDTYPE_PID);
	nr = pid_vnr(pid);

	if (clone_flags & CLONE_PARENT_SETTID)
		put_user(nr, args->parent_tid);

	if (clone_flags & CLONE_VFORK) {
		p->vfork_done = &vfork;
		init_completion(&vfork);
		get_task_struct(p);
	}

	wake_up_new_task(p);

	/* forking complete and child started to run, tell ptracer */
	if (unlikely(trace))
		ptrace_event_pid(trace, pid);

	if (clone_flags & CLONE_VFORK) {
		if (!wait_for_vfork_done(p, &vfork))
			ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
	}

	put_pid(pid);
	return nr;
}

/*
 * Create a kernel thread.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	struct kernel_clone_args args = {
		.flags		= ((lower_32_bits(flags) | CLONE_VM |
				    CLONE_UNTRACED) & ~CSIGNAL),
		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
		.stack		= (unsigned long)fn,
		.stack_size	= (unsigned long)arg,
	};

	return kernel_clone(&args);
}
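
/*
 * Usage sketch (illustrative; my_thread_fn and my_data are hypothetical):
 *
 *	pid = kernel_thread(my_thread_fn, my_data, CLONE_FS | CLONE_FILES);
 *
 * Most kernel code should use the kthread_create()/kthread_run() helpers,
 * which are built on top of this primitive and cooperate with
 * kthread_stop().
 */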

#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
	struct kernel_clone_args args = {
		.exit_signal = SIGCHLD,
	};

	return kernel_clone(&args);
#else
	/* can not support in nommu mode */
	return -EINVAL;
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
	struct kernel_clone_args args = {
		.flags		= CLONE_VFORK | CLONE_VM,
		.exit_signal	= SIGCHLD,
	};

	return kernel_clone(&args);
}
#endif
2591
2592#ifdef __ARCH_WANT_SYS_CLONE
2593#ifdef CONFIG_CLONE_BACKWARDS
2594SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2595 int __user *, parent_tidptr,
2596 unsigned long, tls,
2597 int __user *, child_tidptr)
2598#elif defined(CONFIG_CLONE_BACKWARDS2)
2599SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
2600 int __user *, parent_tidptr,
2601 int __user *, child_tidptr,
2602 unsigned long, tls)
2603#elif defined(CONFIG_CLONE_BACKWARDS3)
2604SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
2605 int, stack_size,
2606 int __user *, parent_tidptr,
2607 int __user *, child_tidptr,
2608 unsigned long, tls)
2609#else
2610SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2611 int __user *, parent_tidptr,
2612 int __user *, child_tidptr,
2613 unsigned long, tls)
2614#endif
2615{
2616 struct kernel_clone_args args = {
2617 .flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
2618 .pidfd = parent_tidptr,
2619 .child_tid = child_tidptr,
2620 .parent_tid = parent_tidptr,
2621 .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
2622 .stack = newsp,
2623 .tls = tls,
2624 };
2625
2626 return kernel_clone(&args);
2627}
2628#endif
2629
2630#ifdef __ARCH_WANT_SYS_CLONE3
2631
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
					      struct clone_args __user *uargs,
					      size_t usize)
{
	int err;
	struct clone_args args;
	pid_t *kset_tid = kargs->set_tid;

	BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
		     CLONE_ARGS_SIZE_VER0);
	BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
		     CLONE_ARGS_SIZE_VER1);
	BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
		     CLONE_ARGS_SIZE_VER2);
	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);

	if (unlikely(usize > PAGE_SIZE))
		return -E2BIG;
	if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
		return -EINVAL;

	err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
	if (err)
		return err;

	if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
		return -EINVAL;

	if (unlikely(!args.set_tid && args.set_tid_size > 0))
		return -EINVAL;

	if (unlikely(args.set_tid && args.set_tid_size == 0))
		return -EINVAL;

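	/*
	 * Verify that higher 32bits of exit_signal are unset and that
	 * it is a valid signal
	 */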
	if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
		     !valid_signal(args.exit_signal)))
		return -EINVAL;

	if ((args.flags & CLONE_INTO_CGROUP) &&
	    (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
		return -EINVAL;

	*kargs = (struct kernel_clone_args){
		.flags		= args.flags,
		.pidfd		= u64_to_user_ptr(args.pidfd),
		.child_tid	= u64_to_user_ptr(args.child_tid),
		.parent_tid	= u64_to_user_ptr(args.parent_tid),
		.exit_signal	= args.exit_signal,
		.stack		= args.stack,
		.stack_size	= args.stack_size,
		.tls		= args.tls,
		.set_tid_size	= args.set_tid_size,
		.cgroup		= args.cgroup,
	};

	if (args.set_tid &&
		copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
			(kargs->set_tid_size * sizeof(pid_t))))
		return -EFAULT;

	kargs->set_tid = kset_tid;

	return 0;
}

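/**
 * clone3_stack_valid - check and prepare stack
 * @kargs: kernel clone args
 *
 * Verify that the stack arguments userspace gave us are sane.
 * In addition, set the stack direction for userspace since it's easy for us to
 * determine.
 */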
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
{
	if (kargs->stack == 0) {
		if (kargs->stack_size > 0)
			return false;
	} else {
		if (kargs->stack_size == 0)
			return false;

		if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
			return false;

#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
		kargs->stack += kargs->stack_size;
#endif
	}

	return true;
}

static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
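	/* Verify that no unknown flags are passed along. */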
	if (kargs->flags &
	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
		return false;

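	/*
	 * - make the CLONE_DETACHED bit reusable for clone3
	 * - make the CSIGNAL bits reusable for clone3
	 */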
	if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
		return false;

	if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
	    (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
		return false;

	if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
	    kargs->exit_signal)
		return false;

	if (!clone3_stack_valid(kargs))
		return false;

	return true;
}

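/**
 * clone3 - create a new process with specific properties
 * @uargs: argument structure
 * @size:  size of @uargs
 *
 * clone3() is the extension of clone()/clone2(). It takes a struct as
 * argument that is versioned to ensure backwards compatibility.
 *
 * A minimal userspace sketch (illustrative only; assumes a libc that
 * provides syscall(2) and the uapi struct clone_args layout):
 *
 *	struct clone_args args = {
 *		.exit_signal = SIGCHLD,
 *	};
 *	pid_t pid = syscall(__NR_clone3, &args, sizeof(args));
 *
 * Return: On success, a positive PID for the child process.
 *         On error, a negative errno number.
 */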
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
{
	int err;

	struct kernel_clone_args kargs;
	pid_t set_tid[MAX_PID_NS_LEVEL];

	kargs.set_tid = set_tid;

	err = copy_clone_args_from_user(&kargs, uargs, size);
	if (err)
		return err;

	if (!clone3_args_valid(&kargs))
		return -EINVAL;

	return kernel_clone(&kargs);
}
#endif

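/*
 * Walk the process tree below @top (normalized to its thread group
 * leader), calling @visitor on each child task with tasklist_lock held
 * for reading. A positive return from @visitor descends into that
 * child's subtree; a negative return aborts the walk.
 */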
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
	struct task_struct *leader, *parent, *child;
	int res;

	read_lock(&tasklist_lock);
	leader = top = top->group_leader;
down:
	for_each_thread(leader, parent) {
		list_for_each_entry(child, &parent->children, sibling) {
			res = visitor(child, data);
			if (res) {
				if (res < 0)
					goto out;
				leader = child;
				goto down;
			}
up:
			;
		}
	}

	if (leader != top) {
		child = leader;
		parent = child->real_parent;
		leader = parent->group_leader;
		goto up;
	}
out:
	read_unlock(&tasklist_lock);
}

#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif

static void sighand_ctor(void *data)
{
	struct sighand_struct *sighand = data;

	spin_lock_init(&sighand->siglock);
	init_waitqueue_head(&sighand->signalfd_wqh);
}

void __init proc_caches_init(void)
{
	unsigned int mm_size;

	sighand_cachep = kmem_cache_create("sighand_cache",
			sizeof(struct sighand_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
			SLAB_ACCOUNT, sighand_ctor);
	signal_cachep = kmem_cache_create("signal_cache",
			sizeof(struct signal_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			NULL);
	files_cachep = kmem_cache_create("files_cache",
			sizeof(struct files_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			NULL);
	fs_cachep = kmem_cache_create("fs_cache",
			sizeof(struct fs_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			NULL);

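	/*
	 * The mm_cpumask is located at the end of mm_struct, and is
	 * dynamically sized based on the maximum CPU number this system
	 * can have, taking hotplug into account, which is at most nr_cpu_ids.
	 */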
	mm_size = sizeof(struct mm_struct) + cpumask_size();

	mm_cachep = kmem_cache_create_usercopy("mm_struct",
			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
			offsetof(struct mm_struct, saved_auxv),
			sizeof_field(struct mm_struct, saved_auxv),
			NULL);
	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
	mmap_init();
	nsproxy_cache_init();
}

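/*
 * Check constraints on flags passed to the unshare system call.
 */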
static int check_unshare_flags(unsigned long unshare_flags)
{
	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
				CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
				CLONE_NEWTIME))
		return -EINVAL;
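	/*
	 * Not implemented, but pretend it works if there is nothing
	 * to unshare.  Note that unsharing the address space or the
	 * signal handlers also need to unshare the signal queues (aka
	 * CLONE_THREAD).
	 */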
	if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
		if (!thread_group_empty(current))
			return -EINVAL;
	}
	if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
		if (refcount_read(&current->sighand->count) > 1)
			return -EINVAL;
	}
	if (unshare_flags & CLONE_VM) {
		if (!current_is_single_threaded())
			return -EINVAL;
	}

	return 0;
}

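/*
 * Unshare the filesystem structure if it is being shared
 */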
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
	struct fs_struct *fs = current->fs;

	if (!(unshare_flags & CLONE_FS) || !fs)
		return 0;

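	/* don't need lock here; in the worst case we'll do useless copy */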
	if (fs->users == 1)
		return 0;

	*new_fsp = copy_fs_struct(fs);
	if (!*new_fsp)
		return -ENOMEM;

	return 0;
}

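/*
 * Unshare file descriptor table if it is being shared
 */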
int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
	       struct files_struct **new_fdp)
{
	struct files_struct *fd = current->files;
	int error = 0;

	if ((unshare_flags & CLONE_FILES) &&
	    (fd && atomic_read(&fd->count) > 1)) {
		*new_fdp = dup_fd(fd, max_fds, &error);
		if (!*new_fdp)
			return error;
	}

	return 0;
}

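/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by kernel_clone() cannot be used here directly
 * because they modify an inactive task_struct that is being
 * constructed. Here we are modifying the current, active,
 * task_struct.
 */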
int ksys_unshare(unsigned long unshare_flags)
{
	struct fs_struct *fs, *new_fs = NULL;
	struct files_struct *fd, *new_fd = NULL;
	struct cred *new_cred = NULL;
	struct nsproxy *new_nsproxy = NULL;
	int do_sysvsem = 0;
	int err;

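	/*
	 * If unsharing a user namespace, must also unshare the thread group
	 * and unshare the filesystem root and working directories.
	 */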
	if (unshare_flags & CLONE_NEWUSER)
		unshare_flags |= CLONE_THREAD | CLONE_FS;
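	/*
	 * If unsharing vm, must also unshare signal handlers.
	 */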
	if (unshare_flags & CLONE_VM)
		unshare_flags |= CLONE_SIGHAND;
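	/*
	 * If unsharing signal handlers, must also unshare the signal queues.
	 */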
	if (unshare_flags & CLONE_SIGHAND)
		unshare_flags |= CLONE_THREAD;
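	/*
	 * If unsharing namespace, must also unshare filesystem information.
	 */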
	if (unshare_flags & CLONE_NEWNS)
		unshare_flags |= CLONE_FS;

	err = check_unshare_flags(unshare_flags);
	if (err)
		goto bad_unshare_out;
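	/*
	 * CLONE_NEWIPC must also detach from the undolist: after switching
	 * to a new ipc namespace, the semaphore arrays from the old
	 * namespace are unreachable.
	 */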
	if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
		do_sysvsem = 1;
	err = unshare_fs(unshare_flags, &new_fs);
	if (err)
		goto bad_unshare_out;
	err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
	if (err)
		goto bad_unshare_cleanup_fs;
	err = unshare_userns(unshare_flags, &new_cred);
	if (err)
		goto bad_unshare_cleanup_fd;
	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
					 new_cred, new_fs);
	if (err)
		goto bad_unshare_cleanup_cred;

	if (new_cred) {
		err = set_cred_ucounts(new_cred);
		if (err)
			goto bad_unshare_cleanup_cred;
	}

	if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
		if (do_sysvsem) {
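			/*
			 * CLONE_SYSVSEM is equivalent to sys_exit().
			 */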
			exit_sem(current);
		}
		if (unshare_flags & CLONE_NEWIPC) {
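			/* Orphan segments in old ns (see sem above). */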
			exit_shm(current);
			shm_init_task(current);
		}

		if (new_nsproxy)
			switch_task_namespaces(current, new_nsproxy);

		task_lock(current);

		if (new_fs) {
			fs = current->fs;
			spin_lock(&fs->lock);
			current->fs = new_fs;
			if (--fs->users)
				new_fs = NULL;
			else
				new_fs = fs;
			spin_unlock(&fs->lock);
		}

		if (new_fd) {
			fd = current->files;
			current->files = new_fd;
			new_fd = fd;
		}

		task_unlock(current);

		if (new_cred) {
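			/* Install the new user namespace */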
			commit_creds(new_cred);
			new_cred = NULL;
		}
	}

	perf_event_namespaces(current);

bad_unshare_cleanup_cred:
	if (new_cred)
		put_cred(new_cred);
bad_unshare_cleanup_fd:
	if (new_fd)
		put_files_struct(new_fd);

bad_unshare_cleanup_fs:
	if (new_fs)
		free_fs_struct(new_fs);

bad_unshare_out:
	return err;
}

SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
	return ksys_unshare(unshare_flags);
}

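/*
 *	Helper to unshare the files of the current task.
 *	We don't want to expose copy_files internals to
 *	the exec layer of the kernel.
 */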
int unshare_files(void)
{
	struct task_struct *task = current;
	struct files_struct *old, *copy = NULL;
	int error;

	error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
	if (error || !copy)
		return error;

	old = task->files;
	task_lock(task);
	task->files = copy;
	task_unlock(task);
	put_files_struct(old);
	return 0;
}

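/*
 * sysctl handler for /proc/sys/kernel/threads-max; writes outside the
 * range [1, MAX_THREADS] are rejected by proc_dointvec_minmax().
 */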
int sysctl_max_threads(struct ctl_table *table, int write,
		       void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int ret;
	int threads = max_threads;
	int min = 1;
	int max = MAX_THREADS;

	t = *table;
	t.data = &threads;
	t.extra1 = &min;
	t.extra2 = &max;

	ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (ret || !write)
		return ret;

	max_threads = threads;

	return 0;
}