// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a real bitch. See 'mm/memory.c': 'copy_page_range()'
 */

15#include <linux/anon_inodes.h>
16#include <linux/slab.h>
17#include <linux/sched/autogroup.h>
18#include <linux/sched/mm.h>
19#include <linux/sched/coredump.h>
20#include <linux/sched/user.h>
21#include <linux/sched/numa_balancing.h>
22#include <linux/sched/stat.h>
23#include <linux/sched/task.h>
24#include <linux/sched/task_stack.h>
25#include <linux/sched/cputime.h>
26#include <linux/seq_file.h>
27#include <linux/rtmutex.h>
28#include <linux/init.h>
29#include <linux/unistd.h>
30#include <linux/module.h>
31#include <linux/vmalloc.h>
32#include <linux/completion.h>
33#include <linux/personality.h>
34#include <linux/mempolicy.h>
35#include <linux/sem.h>
36#include <linux/file.h>
37#include <linux/fdtable.h>
38#include <linux/iocontext.h>
39#include <linux/key.h>
40#include <linux/binfmts.h>
41#include <linux/mman.h>
42#include <linux/mmu_notifier.h>
43#include <linux/fs.h>
44#include <linux/mm.h>
45#include <linux/vmacache.h>
46#include <linux/nsproxy.h>
47#include <linux/capability.h>
48#include <linux/cpu.h>
49#include <linux/cgroup.h>
50#include <linux/security.h>
51#include <linux/hugetlb.h>
52#include <linux/seccomp.h>
53#include <linux/swap.h>
54#include <linux/syscalls.h>
55#include <linux/jiffies.h>
56#include <linux/futex.h>
57#include <linux/compat.h>
58#include <linux/kthread.h>
59#include <linux/task_io_accounting_ops.h>
60#include <linux/rcupdate.h>
61#include <linux/ptrace.h>
62#include <linux/mount.h>
63#include <linux/audit.h>
64#include <linux/memcontrol.h>
65#include <linux/ftrace.h>
66#include <linux/proc_fs.h>
67#include <linux/profile.h>
68#include <linux/rmap.h>
69#include <linux/ksm.h>
70#include <linux/acct.h>
71#include <linux/userfaultfd_k.h>
72#include <linux/tsacct_kern.h>
73#include <linux/cn_proc.h>
74#include <linux/freezer.h>
75#include <linux/delayacct.h>
76#include <linux/taskstats_kern.h>
77#include <linux/random.h>
78#include <linux/tty.h>
79#include <linux/blkdev.h>
80#include <linux/fs_struct.h>
81#include <linux/magic.h>
82#include <linux/perf_event.h>
83#include <linux/posix-timers.h>
84#include <linux/user-return-notifier.h>
85#include <linux/oom.h>
86#include <linux/khugepaged.h>
87#include <linux/signalfd.h>
88#include <linux/uprobes.h>
89#include <linux/aio.h>
90#include <linux/compiler.h>
91#include <linux/sysctl.h>
92#include <linux/kcov.h>
93#include <linux/livepatch.h>
94#include <linux/thread_info.h>
95#include <linux/stackleak.h>
96#include <linux/kasan.h>
97
98#include <asm/pgtable.h>
99#include <asm/pgalloc.h>
100#include <linux/uaccess.h>
101#include <asm/mmu_context.h>
102#include <asm/cacheflush.h>
103#include <asm/tlbflush.h>
104
105#include <trace/events/sched.h>
106
107#define CREATE_TRACE_POINTS
108#include <trace/events/task.h>
109
110
111
112
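/*
 * Minimum number of threads to boot the kernel
 */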
113#define MIN_THREADS 20
114
115
116
117
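/*
 * Maximum number of threads
 */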
118#define MAX_THREADS FUTEX_TID_MASK
119
120
121
122
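/*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */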
123unsigned long total_forks;
124int nr_threads;
125
126static int max_threads;
127
128#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
129
130static const char * const resident_page_types[] = {
131 NAMED_ARRAY_INDEX(MM_FILEPAGES),
132 NAMED_ARRAY_INDEX(MM_ANONPAGES),
133 NAMED_ARRAY_INDEX(MM_SWAPENTS),
134 NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
135};
136
137DEFINE_PER_CPU(unsigned long, process_counts) = 0;
138
139__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);
140
141#ifdef CONFIG_PROVE_RCU
142int lockdep_tasklist_lock_is_held(void)
143{
144 return lockdep_is_held(&tasklist_lock);
145}
146EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
147#endif
148
149int nr_processes(void)
150{
151 int cpu;
152 int total = 0;
153
154 for_each_possible_cpu(cpu)
155 total += per_cpu(process_counts, cpu);
156
157 return total;
158}
159
160void __weak arch_release_task_struct(struct task_struct *tsk)
161{
162}
163
164#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
165static struct kmem_cache *task_struct_cachep;
166
167static inline struct task_struct *alloc_task_struct_node(int node)
168{
169 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
170}
171
172static inline void free_task_struct(struct task_struct *tsk)
173{
174 kmem_cache_free(task_struct_cachep, tsk);
175}
176#endif
177
178#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
179
180
181
182
183
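/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */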
184# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
185
186#ifdef CONFIG_VMAP_STACK
187
188
189
190
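/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */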
191#define NR_CACHED_STACKS 2
192static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
193
194static int free_vm_stack_cache(unsigned int cpu)
195{
196 struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
197 int i;
198
199 for (i = 0; i < NR_CACHED_STACKS; i++) {
200 struct vm_struct *vm_stack = cached_vm_stacks[i];
201
202 if (!vm_stack)
203 continue;
204
205 vfree(vm_stack->addr);
206 cached_vm_stacks[i] = NULL;
207 }
208
209 return 0;
210}
211#endif
212
213static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
214{
215#ifdef CONFIG_VMAP_STACK
216 void *stack;
217 int i;
218
219 for (i = 0; i < NR_CACHED_STACKS; i++) {
220 struct vm_struct *s;
221
222 s = this_cpu_xchg(cached_stacks[i], NULL);
223
224 if (!s)
225 continue;
226
227
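		/* Clear the KASAN shadow of the stack. */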
228 kasan_unpoison_shadow(s->addr, THREAD_SIZE);
229
230
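		/* Clear stale pointers from reused stack. */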
231 memset(s->addr, 0, THREAD_SIZE);
232
233 tsk->stack_vm_area = s;
234 tsk->stack = s->addr;
235 return s->addr;
236 }
237
238
239
240
241
242
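	/*
	 * Allocated stacks are cached and later reused by new threads,
	 * so memcg accounting is performed manually on assigning/releasing
	 * stacks to tasks.  Drop __GFP_ACCOUNT.
	 */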
243 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
244 VMALLOC_START, VMALLOC_END,
245 THREADINFO_GFP & ~__GFP_ACCOUNT,
246 PAGE_KERNEL,
247 0, node, __builtin_return_address(0));
248
249
250
251
252
253
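	/*
	 * We can't call find_vm_area() in interrupt context, and
	 * free_thread_stack() can be called in interrupt context,
	 * so cache the vm_struct.
	 */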
254 if (stack) {
255 tsk->stack_vm_area = find_vm_area(stack);
256 tsk->stack = stack;
257 }
258 return stack;
259#else
260 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
261 THREAD_SIZE_ORDER);
262
263 if (likely(page)) {
264 tsk->stack = page_address(page);
265 return tsk->stack;
266 }
267 return NULL;
268#endif
269}
270
271static inline void free_thread_stack(struct task_struct *tsk)
272{
273#ifdef CONFIG_VMAP_STACK
274 struct vm_struct *vm = task_stack_vm_area(tsk);
275
276 if (vm) {
277 int i;
278
279 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
280 mod_memcg_page_state(vm->pages[i],
281 MEMCG_KERNEL_STACK_KB,
282 -(int)(PAGE_SIZE / 1024));
283
284 memcg_kmem_uncharge(vm->pages[i], 0);
285 }
286
287 for (i = 0; i < NR_CACHED_STACKS; i++) {
288 if (this_cpu_cmpxchg(cached_stacks[i],
289 NULL, tsk->stack_vm_area) != NULL)
290 continue;
291
292 return;
293 }
294
295 vfree_atomic(tsk->stack);
296 return;
297 }
298#endif
299
300 __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
301}
302# else
303static struct kmem_cache *thread_stack_cache;
304
305static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
306 int node)
307{
308 unsigned long *stack;
309 stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
310 tsk->stack = stack;
311 return stack;
312}
313
314static void free_thread_stack(struct task_struct *tsk)
315{
316 kmem_cache_free(thread_stack_cache, tsk->stack);
317}
318
319void thread_stack_cache_init(void)
320{
321 thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
322 THREAD_SIZE, THREAD_SIZE, 0, 0,
323 THREAD_SIZE, NULL);
324 BUG_ON(thread_stack_cache == NULL);
325}
326# endif
327#endif
328
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (mm) */
static struct kmem_cache *mm_cachep;
346
347struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
348{
349 struct vm_area_struct *vma;
350
351 vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
352 if (vma)
353 vma_init(vma, mm);
354 return vma;
355}
356
357struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
358{
359 struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
360
361 if (new) {
362 *new = *orig;
363 INIT_LIST_HEAD(&new->anon_vma_chain);
364 }
365 return new;
366}
367
368void vm_area_free(struct vm_area_struct *vma)
369{
370 kmem_cache_free(vm_area_cachep, vma);
371}
372
373static void account_kernel_stack(struct task_struct *tsk, int account)
374{
375 void *stack = task_stack_page(tsk);
376 struct vm_struct *vm = task_stack_vm_area(tsk);
377
378 BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
379
380 if (vm) {
381 int i;
382
383 BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
384
385 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
386 mod_zone_page_state(page_zone(vm->pages[i]),
387 NR_KERNEL_STACK_KB,
388 PAGE_SIZE / 1024 * account);
389 }
390 } else {
391
392
393
394
395 struct page *first_page = virt_to_page(stack);
396
397 mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
398 THREAD_SIZE / 1024 * account);
399
400 mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
401 account * (THREAD_SIZE / 1024));
402 }
403}
404
405static int memcg_charge_kernel_stack(struct task_struct *tsk)
406{
407#ifdef CONFIG_VMAP_STACK
408 struct vm_struct *vm = task_stack_vm_area(tsk);
409 int ret;
410
411 if (vm) {
412 int i;
413
414 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
415
416
417
418
419
420
421 ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0);
422 if (ret)
423 return ret;
424
425 mod_memcg_page_state(vm->pages[i],
426 MEMCG_KERNEL_STACK_KB,
427 PAGE_SIZE / 1024);
428 }
429 }
430#endif
431 return 0;
432}
433
434static void release_task_stack(struct task_struct *tsk)
435{
436 if (WARN_ON(tsk->state != TASK_DEAD))
437 return;
438
439 account_kernel_stack(tsk, -1);
440 free_thread_stack(tsk);
441 tsk->stack = NULL;
442#ifdef CONFIG_VMAP_STACK
443 tsk->stack_vm_area = NULL;
444#endif
445}
446
447#ifdef CONFIG_THREAD_INFO_IN_TASK
448void put_task_stack(struct task_struct *tsk)
449{
450 if (refcount_dec_and_test(&tsk->stack_refcount))
451 release_task_stack(tsk);
452}
453#endif
454
455void free_task(struct task_struct *tsk)
456{
457#ifndef CONFIG_THREAD_INFO_IN_TASK
458
459
460
461
462 release_task_stack(tsk);
463#else
464
465
466
467
468 WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
469#endif
470 rt_mutex_debug_task_free(tsk);
471 ftrace_graph_exit_task(tsk);
472 put_seccomp_filter(tsk);
473 arch_release_task_struct(tsk);
474 if (tsk->flags & PF_KTHREAD)
475 free_kthread_struct(tsk);
476 free_task_struct(tsk);
477}
478EXPORT_SYMBOL(free_task);
479
480#ifdef CONFIG_MMU
481static __latent_entropy int dup_mmap(struct mm_struct *mm,
482 struct mm_struct *oldmm)
483{
484 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
485 struct rb_node **rb_link, *rb_parent;
486 int retval;
487 unsigned long charge;
488 LIST_HEAD(uf);
489
490 uprobe_start_dup_mmap();
491 if (down_write_killable(&oldmm->mmap_sem)) {
492 retval = -EINTR;
493 goto fail_uprobe_end;
494 }
495 flush_cache_dup_mm(oldmm);
496 uprobe_dup_mmap(oldmm, mm);
497
498
499
500 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
501
502
503 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
504
505 mm->total_vm = oldmm->total_vm;
506 mm->data_vm = oldmm->data_vm;
507 mm->exec_vm = oldmm->exec_vm;
508 mm->stack_vm = oldmm->stack_vm;
509
510 rb_link = &mm->mm_rb.rb_node;
511 rb_parent = NULL;
512 pprev = &mm->mmap;
513 retval = ksm_fork(mm, oldmm);
514 if (retval)
515 goto out;
516 retval = khugepaged_fork(mm, oldmm);
517 if (retval)
518 goto out;
519
520 prev = NULL;
521 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
522 struct file *file;
523
524 if (mpnt->vm_flags & VM_DONTCOPY) {
525 vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
526 continue;
527 }
528 charge = 0;
529
530
531
532
533 if (fatal_signal_pending(current)) {
534 retval = -EINTR;
535 goto out;
536 }
537 if (mpnt->vm_flags & VM_ACCOUNT) {
538 unsigned long len = vma_pages(mpnt);
539
540 if (security_vm_enough_memory_mm(oldmm, len))
541 goto fail_nomem;
542 charge = len;
543 }
544 tmp = vm_area_dup(mpnt);
545 if (!tmp)
546 goto fail_nomem;
547 retval = vma_dup_policy(mpnt, tmp);
548 if (retval)
549 goto fail_nomem_policy;
550 tmp->vm_mm = mm;
551 retval = dup_userfaultfd(tmp, &uf);
552 if (retval)
553 goto fail_nomem_anon_vma_fork;
554 if (tmp->vm_flags & VM_WIPEONFORK) {
555
556 tmp->anon_vma = NULL;
557 if (anon_vma_prepare(tmp))
558 goto fail_nomem_anon_vma_fork;
559 } else if (anon_vma_fork(tmp, mpnt))
560 goto fail_nomem_anon_vma_fork;
561 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
562 tmp->vm_next = tmp->vm_prev = NULL;
563 file = tmp->vm_file;
564 if (file) {
565 struct inode *inode = file_inode(file);
566 struct address_space *mapping = file->f_mapping;
567
568 get_file(file);
569 if (tmp->vm_flags & VM_DENYWRITE)
570 atomic_dec(&inode->i_writecount);
571 i_mmap_lock_write(mapping);
572 if (tmp->vm_flags & VM_SHARED)
573 atomic_inc(&mapping->i_mmap_writable);
574 flush_dcache_mmap_lock(mapping);
575
576 vma_interval_tree_insert_after(tmp, mpnt,
577 &mapping->i_mmap);
578 flush_dcache_mmap_unlock(mapping);
579 i_mmap_unlock_write(mapping);
580 }
581
582
583
584
585
586
587 if (is_vm_hugetlb_page(tmp))
588 reset_vma_resv_huge_pages(tmp);
589
590
591
592
593 *pprev = tmp;
594 pprev = &tmp->vm_next;
595 tmp->vm_prev = prev;
596 prev = tmp;
597
598 __vma_link_rb(mm, tmp, rb_link, rb_parent);
599 rb_link = &tmp->vm_rb.rb_right;
600 rb_parent = &tmp->vm_rb;
601
602 mm->map_count++;
603 if (!(tmp->vm_flags & VM_WIPEONFORK))
604 retval = copy_page_range(mm, oldmm, mpnt);
605
606 if (tmp->vm_ops && tmp->vm_ops->open)
607 tmp->vm_ops->open(tmp);
608
609 if (retval)
610 goto out;
611 }
612
613 retval = arch_dup_mmap(oldmm, mm);
614out:
615 up_write(&mm->mmap_sem);
616 flush_tlb_mm(oldmm);
617 up_write(&oldmm->mmap_sem);
618 dup_userfaultfd_complete(&uf);
619fail_uprobe_end:
620 uprobe_end_dup_mmap();
621 return retval;
622fail_nomem_anon_vma_fork:
623 mpol_put(vma_policy(tmp));
624fail_nomem_policy:
625 vm_area_free(tmp);
626fail_nomem:
627 retval = -ENOMEM;
628 vm_unacct_memory(charge);
629 goto out;
630}
631
632static inline int mm_alloc_pgd(struct mm_struct *mm)
633{
634 mm->pgd = pgd_alloc(mm);
635 if (unlikely(!mm->pgd))
636 return -ENOMEM;
637 return 0;
638}
639
640static inline void mm_free_pgd(struct mm_struct *mm)
641{
642 pgd_free(mm, mm->pgd);
643}
644#else
645static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
646{
647 down_write(&oldmm->mmap_sem);
648 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
649 up_write(&oldmm->mmap_sem);
650 return 0;
651}
652#define mm_alloc_pgd(mm) (0)
653#define mm_free_pgd(mm)
654#endif
655
656static void check_mm(struct mm_struct *mm)
657{
658 int i;
659
660 BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
661 "Please make sure 'struct resident_page_types[]' is updated as well");
662
663 for (i = 0; i < NR_MM_COUNTERS; i++) {
664 long x = atomic_long_read(&mm->rss_stat.count[i]);
665
666 if (unlikely(x))
667 pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
668 mm, resident_page_types[i], x);
669 }
670
671 if (mm_pgtables_bytes(mm))
672 pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
673 mm_pgtables_bytes(mm));
674
675#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
676 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
677#endif
678}
679
680#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
681#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
682
683
684
685
686
687
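/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput.  Free the page directory and the mm.
 */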
688void __mmdrop(struct mm_struct *mm)
689{
690 BUG_ON(mm == &init_mm);
691 WARN_ON_ONCE(mm == current->mm);
692 WARN_ON_ONCE(mm == current->active_mm);
693 mm_free_pgd(mm);
694 destroy_context(mm);
695 mmu_notifier_subscriptions_destroy(mm);
696 check_mm(mm);
697 put_user_ns(mm->user_ns);
698 free_mm(mm);
699}
700EXPORT_SYMBOL_GPL(__mmdrop);
701
702static void mmdrop_async_fn(struct work_struct *work)
703{
704 struct mm_struct *mm;
705
706 mm = container_of(work, struct mm_struct, async_put_work);
707 __mmdrop(mm);
708}
709
710static void mmdrop_async(struct mm_struct *mm)
711{
712 if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
713 INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
714 schedule_work(&mm->async_put_work);
715 }
716}
717
718static inline void free_signal_struct(struct signal_struct *sig)
719{
720 taskstats_tgid_free(sig);
721 sched_autogroup_exit(sig);
722
723
724
725
726 if (sig->oom_mm)
727 mmdrop_async(sig->oom_mm);
728 kmem_cache_free(signal_cachep, sig);
729}
730
731static inline void put_signal_struct(struct signal_struct *sig)
732{
733 if (refcount_dec_and_test(&sig->sigcnt))
734 free_signal_struct(sig);
735}
736
737void __put_task_struct(struct task_struct *tsk)
738{
739 WARN_ON(!tsk->exit_state);
740 WARN_ON(refcount_read(&tsk->usage));
741 WARN_ON(tsk == current);
742
743 cgroup_free(tsk);
744 task_numa_free(tsk, true);
745 security_task_free(tsk);
746 exit_creds(tsk);
747 delayacct_tsk_free(tsk);
748 put_signal_struct(tsk->signal);
749
750 if (!profile_handoff_task(tsk))
751 free_task(tsk);
752}
753EXPORT_SYMBOL_GPL(__put_task_struct);
754
755void __init __weak arch_task_cache_init(void) { }
756
757
758
759
760static void set_max_threads(unsigned int max_threads_suggested)
761{
762 u64 threads;
763 unsigned long nr_pages = totalram_pages();
764
765
766
767
768
769 if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
770 threads = MAX_THREADS;
771 else
772 threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
773 (u64) THREAD_SIZE * 8UL);
774
775 if (threads > max_threads_suggested)
776 threads = max_threads_suggested;
777
778 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
779}
780
781#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
782
783int arch_task_struct_size __read_mostly;
784#endif
785
786#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
787static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
788{
789
790 arch_thread_struct_whitelist(offset, size);
791
792
793
794
795
796 if (unlikely(*size == 0))
797 *offset = 0;
798 else
799 *offset += offsetof(struct task_struct, thread);
800}
801#endif
802
803void __init fork_init(void)
804{
805 int i;
806#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
807#ifndef ARCH_MIN_TASKALIGN
808#define ARCH_MIN_TASKALIGN 0
809#endif
810 int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
811 unsigned long useroffset, usersize;
812
813
814 task_struct_whitelist(&useroffset, &usersize);
815 task_struct_cachep = kmem_cache_create_usercopy("task_struct",
816 arch_task_struct_size, align,
817 SLAB_PANIC|SLAB_ACCOUNT,
818 useroffset, usersize, NULL);
819#endif
820
821
822 arch_task_cache_init();
823
824 set_max_threads(MAX_THREADS);
825
826 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
827 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
828 init_task.signal->rlim[RLIMIT_SIGPENDING] =
829 init_task.signal->rlim[RLIMIT_NPROC];
830
831 for (i = 0; i < UCOUNT_COUNTS; i++) {
832 init_user_ns.ucount_max[i] = max_threads/2;
833 }
834
835#ifdef CONFIG_VMAP_STACK
836 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
837 NULL, free_vm_stack_cache);
838#endif
839
840 lockdep_init_task(&init_task);
841 uprobes_init();
842}
843
844int __weak arch_dup_task_struct(struct task_struct *dst,
845 struct task_struct *src)
846{
847 *dst = *src;
848 return 0;
849}
850
851void set_task_stack_end_magic(struct task_struct *tsk)
852{
853 unsigned long *stackend;
854
855 stackend = end_of_stack(tsk);
856 *stackend = STACK_END_MAGIC;
857}
858
859static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
860{
861 struct task_struct *tsk;
862 unsigned long *stack;
863 struct vm_struct *stack_vm_area __maybe_unused;
864 int err;
865
866 if (node == NUMA_NO_NODE)
867 node = tsk_fork_get_node(orig);
868 tsk = alloc_task_struct_node(node);
869 if (!tsk)
870 return NULL;
871
872 stack = alloc_thread_stack_node(tsk, node);
873 if (!stack)
874 goto free_tsk;
875
876 if (memcg_charge_kernel_stack(tsk))
877 goto free_stack;
878
879 stack_vm_area = task_stack_vm_area(tsk);
880
881 err = arch_dup_task_struct(tsk, orig);
882
883
884
885
886
887
888 tsk->stack = stack;
889#ifdef CONFIG_VMAP_STACK
890 tsk->stack_vm_area = stack_vm_area;
891#endif
892#ifdef CONFIG_THREAD_INFO_IN_TASK
893 refcount_set(&tsk->stack_refcount, 1);
894#endif
895
896 if (err)
897 goto free_stack;
898
899#ifdef CONFIG_SECCOMP
900
901
902
903
904
905
906 tsk->seccomp.filter = NULL;
907#endif
908
909 setup_thread_stack(tsk, orig);
910 clear_user_return_notifier(tsk);
911 clear_tsk_need_resched(tsk);
912 set_task_stack_end_magic(tsk);
913
914#ifdef CONFIG_STACKPROTECTOR
915 tsk->stack_canary = get_random_canary();
916#endif
917 if (orig->cpus_ptr == &orig->cpus_mask)
918 tsk->cpus_ptr = &tsk->cpus_mask;
919
920
921
922
923
924 refcount_set(&tsk->rcu_users, 2);
925
926 refcount_set(&tsk->usage, 1);
927#ifdef CONFIG_BLK_DEV_IO_TRACE
928 tsk->btrace_seq = 0;
929#endif
930 tsk->splice_pipe = NULL;
931 tsk->task_frag.page = NULL;
932 tsk->wake_q.next = NULL;
933
934 account_kernel_stack(tsk, 1);
935
936 kcov_task_init(tsk);
937
938#ifdef CONFIG_FAULT_INJECTION
939 tsk->fail_nth = 0;
940#endif
941
942#ifdef CONFIG_BLK_CGROUP
943 tsk->throttle_queue = NULL;
944 tsk->use_memdelay = 0;
945#endif
946
947#ifdef CONFIG_MEMCG
948 tsk->active_memcg = NULL;
949#endif
950 return tsk;
951
952free_stack:
953 free_thread_stack(tsk);
954free_tsk:
955 free_task_struct(tsk);
956 return NULL;
957}
958
959__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
960
961static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
962
963static int __init coredump_filter_setup(char *s)
964{
965 default_dump_filter =
966 (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
967 MMF_DUMP_FILTER_MASK;
968 return 1;
969}
970
971__setup("coredump_filter=", coredump_filter_setup);
972
973#include <linux/init_task.h>
974
975static void mm_init_aio(struct mm_struct *mm)
976{
977#ifdef CONFIG_AIO
978 spin_lock_init(&mm->ioctx_lock);
979 mm->ioctx_table = NULL;
980#endif
981}
982
983static __always_inline void mm_clear_owner(struct mm_struct *mm,
984 struct task_struct *p)
985{
986#ifdef CONFIG_MEMCG
987 if (mm->owner == p)
988 WRITE_ONCE(mm->owner, NULL);
989#endif
990}
991
992static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
993{
994#ifdef CONFIG_MEMCG
995 mm->owner = p;
996#endif
997}
998
999static void mm_init_uprobes_state(struct mm_struct *mm)
1000{
1001#ifdef CONFIG_UPROBES
1002 mm->uprobes_state.xol_area = NULL;
1003#endif
1004}
1005
1006static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
1007 struct user_namespace *user_ns)
1008{
1009 mm->mmap = NULL;
1010 mm->mm_rb = RB_ROOT;
1011 mm->vmacache_seqnum = 0;
1012 atomic_set(&mm->mm_users, 1);
1013 atomic_set(&mm->mm_count, 1);
1014 init_rwsem(&mm->mmap_sem);
1015 INIT_LIST_HEAD(&mm->mmlist);
1016 mm->core_state = NULL;
1017 mm_pgtables_bytes_init(mm);
1018 mm->map_count = 0;
1019 mm->locked_vm = 0;
1020 atomic64_set(&mm->pinned_vm, 0);
1021 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
1022 spin_lock_init(&mm->page_table_lock);
1023 spin_lock_init(&mm->arg_lock);
1024 mm_init_cpumask(mm);
1025 mm_init_aio(mm);
1026 mm_init_owner(mm, p);
1027 RCU_INIT_POINTER(mm->exe_file, NULL);
1028 mmu_notifier_subscriptions_init(mm);
1029 init_tlb_flush_pending(mm);
1030#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
1031 mm->pmd_huge_pte = NULL;
1032#endif
1033 mm_init_uprobes_state(mm);
1034
1035 if (current->mm) {
1036 mm->flags = current->mm->flags & MMF_INIT_MASK;
1037 mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
1038 } else {
1039 mm->flags = default_dump_filter;
1040 mm->def_flags = 0;
1041 }
1042
1043 if (mm_alloc_pgd(mm))
1044 goto fail_nopgd;
1045
1046 if (init_new_context(p, mm))
1047 goto fail_nocontext;
1048
1049 mm->user_ns = get_user_ns(user_ns);
1050 return mm;
1051
1052fail_nocontext:
1053 mm_free_pgd(mm);
1054fail_nopgd:
1055 free_mm(mm);
1056 return NULL;
1057}
1058
1059
1060
1061
1062struct mm_struct *mm_alloc(void)
1063{
1064 struct mm_struct *mm;
1065
1066 mm = allocate_mm();
1067 if (!mm)
1068 return NULL;
1069
1070 memset(mm, 0, sizeof(*mm));
1071 return mm_init(mm, current, current_user_ns());
1072}
1073
1074static inline void __mmput(struct mm_struct *mm)
1075{
1076 VM_BUG_ON(atomic_read(&mm->mm_users));
1077
1078 uprobe_clear_state(mm);
1079 exit_aio(mm);
1080 ksm_exit(mm);
1081 khugepaged_exit(mm);
1082 exit_mmap(mm);
1083 mm_put_huge_zero_page(mm);
1084 set_mm_exe_file(mm, NULL);
1085 if (!list_empty(&mm->mmlist)) {
1086 spin_lock(&mmlist_lock);
1087 list_del(&mm->mmlist);
1088 spin_unlock(&mmlist_lock);
1089 }
1090 if (mm->binfmt)
1091 module_put(mm->binfmt->module);
1092 mmdrop(mm);
1093}
1094
1095
1096
1097
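/*
 * Decrement the use count and release all resources for an mm.
 */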
1098void mmput(struct mm_struct *mm)
1099{
1100 might_sleep();
1101
1102 if (atomic_dec_and_test(&mm->mm_users))
1103 __mmput(mm);
1104}
1105EXPORT_SYMBOL_GPL(mmput);
1106
1107#ifdef CONFIG_MMU
1108static void mmput_async_fn(struct work_struct *work)
1109{
1110 struct mm_struct *mm = container_of(work, struct mm_struct,
1111 async_put_work);
1112
1113 __mmput(mm);
1114}
1115
1116void mmput_async(struct mm_struct *mm)
1117{
1118 if (atomic_dec_and_test(&mm->mm_users)) {
1119 INIT_WORK(&mm->async_put_work, mmput_async_fn);
1120 schedule_work(&mm->async_put_work);
1121 }
1122}
1123#endif
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1137{
1138 struct file *old_exe_file;
1139
1140
1141
1142
1143
1144
1145 old_exe_file = rcu_dereference_raw(mm->exe_file);
1146
1147 if (new_exe_file)
1148 get_file(new_exe_file);
1149 rcu_assign_pointer(mm->exe_file, new_exe_file);
1150 if (old_exe_file)
1151 fput(old_exe_file);
1152}
1153
1154
1155
1156
1157
1158
1159
1160struct file *get_mm_exe_file(struct mm_struct *mm)
1161{
1162 struct file *exe_file;
1163
1164 rcu_read_lock();
1165 exe_file = rcu_dereference(mm->exe_file);
1166 if (exe_file && !get_file_rcu(exe_file))
1167 exe_file = NULL;
1168 rcu_read_unlock();
1169 return exe_file;
1170}
1171EXPORT_SYMBOL(get_mm_exe_file);
1172
1173
1174
1175
1176
1177
1178
1179
1180struct file *get_task_exe_file(struct task_struct *task)
1181{
1182 struct file *exe_file = NULL;
1183 struct mm_struct *mm;
1184
1185 task_lock(task);
1186 mm = task->mm;
1187 if (mm) {
1188 if (!(task->flags & PF_KTHREAD))
1189 exe_file = get_mm_exe_file(mm);
1190 }
1191 task_unlock(task);
1192 return exe_file;
1193}
1194EXPORT_SYMBOL(get_task_exe_file);
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
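/**
 * get_task_mm - acquire a reference to the task's mm
 *
 * Returns %NULL if the task has no mm or is a kernel thread (PF_KTHREAD).
 * Otherwise the mm's use count is bumped; the caller must release the
 * mm via mmput() after use.  Typically used by /proc and ptrace.
 */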
1205struct mm_struct *get_task_mm(struct task_struct *task)
1206{
1207 struct mm_struct *mm;
1208
1209 task_lock(task);
1210 mm = task->mm;
1211 if (mm) {
1212 if (task->flags & PF_KTHREAD)
1213 mm = NULL;
1214 else
1215 mmget(mm);
1216 }
1217 task_unlock(task);
1218 return mm;
1219}
1220EXPORT_SYMBOL_GPL(get_task_mm);
1221
1222struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
1223{
1224 struct mm_struct *mm;
1225 int err;
1226
1227 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
1228 if (err)
1229 return ERR_PTR(err);
1230
1231 mm = get_task_mm(task);
1232 if (mm && mm != current->mm &&
1233 !ptrace_may_access(task, mode)) {
1234 mmput(mm);
1235 mm = ERR_PTR(-EACCES);
1236 }
1237 mutex_unlock(&task->signal->cred_guard_mutex);
1238
1239 return mm;
1240}
1241
1242static void complete_vfork_done(struct task_struct *tsk)
1243{
1244 struct completion *vfork;
1245
1246 task_lock(tsk);
1247 vfork = tsk->vfork_done;
1248 if (likely(vfork)) {
1249 tsk->vfork_done = NULL;
1250 complete(vfork);
1251 }
1252 task_unlock(tsk);
1253}
1254
1255static int wait_for_vfork_done(struct task_struct *child,
1256 struct completion *vfork)
1257{
1258 int killed;
1259
1260 freezer_do_not_count();
1261 cgroup_enter_frozen();
1262 killed = wait_for_completion_killable(vfork);
1263 cgroup_leave_frozen(false);
1264 freezer_count();
1265
1266 if (killed) {
1267 task_lock(child);
1268 child->vfork_done = NULL;
1269 task_unlock(child);
1270 }
1271
1272 put_task_struct(child);
1273 return killed;
1274}
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
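/*
 * Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 */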
1289static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1290{
1291 uprobe_free_utask(tsk);
1292
1293
1294 deactivate_mm(tsk, mm);
1295
1296
1297
1298
1299
1300
1301 if (tsk->clear_child_tid) {
1302 if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
1303 atomic_read(&mm->mm_users) > 1) {
1304
1305
1306
1307
1308 put_user(0, tsk->clear_child_tid);
1309 do_futex(tsk->clear_child_tid, FUTEX_WAKE,
1310 1, NULL, NULL, 0, 0);
1311 }
1312 tsk->clear_child_tid = NULL;
1313 }
1314
1315
1316
1317
1318
1319 if (tsk->vfork_done)
1320 complete_vfork_done(tsk);
1321}
1322
1323void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1324{
1325 futex_exit_release(tsk);
1326 mm_release(tsk, mm);
1327}
1328
1329void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1330{
1331 futex_exec_release(tsk);
1332 mm_release(tsk, mm);
1333}
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
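/**
 * dup_mm() - duplicates an existing mm structure
 * @tsk: the task_struct with which the new mm will be associated.
 * @oldmm: the mm to duplicate.
 *
 * Allocates a new mm structure and duplicates the provided @oldmm structure
 * content into it.
 *
 * Return: the duplicated mm or NULL on failure.
 */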
1345static struct mm_struct *dup_mm(struct task_struct *tsk,
1346 struct mm_struct *oldmm)
1347{
1348 struct mm_struct *mm;
1349 int err;
1350
1351 mm = allocate_mm();
1352 if (!mm)
1353 goto fail_nomem;
1354
1355 memcpy(mm, oldmm, sizeof(*mm));
1356
1357 if (!mm_init(mm, tsk, mm->user_ns))
1358 goto fail_nomem;
1359
1360 err = dup_mmap(mm, oldmm);
1361 if (err)
1362 goto free_pt;
1363
1364 mm->hiwater_rss = get_mm_rss(mm);
1365 mm->hiwater_vm = mm->total_vm;
1366
1367 if (mm->binfmt && !try_module_get(mm->binfmt->module))
1368 goto free_pt;
1369
1370 return mm;
1371
1372free_pt:
1373
1374 mm->binfmt = NULL;
1375 mm_init_owner(mm, NULL);
1376 mmput(mm);
1377
1378fail_nomem:
1379 return NULL;
1380}
1381
1382static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1383{
1384 struct mm_struct *mm, *oldmm;
1385 int retval;
1386
1387 tsk->min_flt = tsk->maj_flt = 0;
1388 tsk->nvcsw = tsk->nivcsw = 0;
1389#ifdef CONFIG_DETECT_HUNG_TASK
1390 tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
1391 tsk->last_switch_time = 0;
1392#endif
1393
1394 tsk->mm = NULL;
1395 tsk->active_mm = NULL;
1396
1397
1398
1399
1400
1401
1402 oldmm = current->mm;
1403 if (!oldmm)
1404 return 0;
1405
1406
1407 vmacache_flush(tsk);
1408
1409 if (clone_flags & CLONE_VM) {
1410 mmget(oldmm);
1411 mm = oldmm;
1412 goto good_mm;
1413 }
1414
1415 retval = -ENOMEM;
1416 mm = dup_mm(tsk, current->mm);
1417 if (!mm)
1418 goto fail_nomem;
1419
1420good_mm:
1421 tsk->mm = mm;
1422 tsk->active_mm = mm;
1423 return 0;
1424
1425fail_nomem:
1426 return retval;
1427}
1428
1429static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
1430{
1431 struct fs_struct *fs = current->fs;
1432 if (clone_flags & CLONE_FS) {
1433
1434 spin_lock(&fs->lock);
1435 if (fs->in_exec) {
1436 spin_unlock(&fs->lock);
1437 return -EAGAIN;
1438 }
1439 fs->users++;
1440 spin_unlock(&fs->lock);
1441 return 0;
1442 }
1443 tsk->fs = copy_fs_struct(fs);
1444 if (!tsk->fs)
1445 return -ENOMEM;
1446 return 0;
1447}
1448
1449static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
1450{
1451 struct files_struct *oldf, *newf;
1452 int error = 0;
1453
1454
1455
1456
1457 oldf = current->files;
1458 if (!oldf)
1459 goto out;
1460
1461 if (clone_flags & CLONE_FILES) {
1462 atomic_inc(&oldf->count);
1463 goto out;
1464 }
1465
1466 newf = dup_fd(oldf, &error);
1467 if (!newf)
1468 goto out;
1469
1470 tsk->files = newf;
1471 error = 0;
1472out:
1473 return error;
1474}
1475
1476static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
1477{
1478#ifdef CONFIG_BLOCK
1479 struct io_context *ioc = current->io_context;
1480 struct io_context *new_ioc;
1481
1482 if (!ioc)
1483 return 0;
1484
1485
1486
1487 if (clone_flags & CLONE_IO) {
1488 ioc_task_link(ioc);
1489 tsk->io_context = ioc;
1490 } else if (ioprio_valid(ioc->ioprio)) {
1491 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
1492 if (unlikely(!new_ioc))
1493 return -ENOMEM;
1494
1495 new_ioc->ioprio = ioc->ioprio;
1496 put_io_context(new_ioc);
1497 }
1498#endif
1499 return 0;
1500}
1501
1502static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1503{
1504 struct sighand_struct *sig;
1505
1506 if (clone_flags & CLONE_SIGHAND) {
		refcount_inc(&current->sighand->count);
1508 return 0;
1509 }
1510 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1511 RCU_INIT_POINTER(tsk->sighand, sig);
1512 if (!sig)
1513 return -ENOMEM;
1514
1515 refcount_set(&sig->count, 1);
	spin_lock_irq(&current->sighand->siglock);
	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
	spin_unlock_irq(&current->sighand->siglock);
1519
1520
1521 if (clone_flags & CLONE_CLEAR_SIGHAND)
1522 flush_signal_handlers(tsk, 0);
1523
1524 return 0;
1525}
1526
1527void __cleanup_sighand(struct sighand_struct *sighand)
1528{
1529 if (refcount_dec_and_test(&sighand->count)) {
1530 signalfd_cleanup(sighand);
1531
1532
1533
1534
1535 kmem_cache_free(sighand_cachep, sighand);
1536 }
1537}
1538
1539
1540
1541
1542static void posix_cpu_timers_init_group(struct signal_struct *sig)
1543{
1544 struct posix_cputimers *pct = &sig->posix_cputimers;
1545 unsigned long cpu_limit;
1546
1547 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1548 posix_cputimers_group_init(pct, cpu_limit);
1549}
1550
1551static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1552{
1553 struct signal_struct *sig;
1554
1555 if (clone_flags & CLONE_THREAD)
1556 return 0;
1557
1558 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
1559 tsk->signal = sig;
1560 if (!sig)
1561 return -ENOMEM;
1562
1563 sig->nr_threads = 1;
1564 atomic_set(&sig->live, 1);
1565 refcount_set(&sig->sigcnt, 1);
1566
1567
1568 sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
1569 tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1570
1571 init_waitqueue_head(&sig->wait_chldexit);
1572 sig->curr_target = tsk;
1573 init_sigpending(&sig->shared_pending);
1574 INIT_HLIST_HEAD(&sig->multiprocess);
1575 seqlock_init(&sig->stats_lock);
1576 prev_cputime_init(&sig->prev_cputime);
1577
1578#ifdef CONFIG_POSIX_TIMERS
1579 INIT_LIST_HEAD(&sig->posix_timers);
1580 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1581 sig->real_timer.function = it_real_fn;
1582#endif
1583
1584 task_lock(current->group_leader);
1585 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
1586 task_unlock(current->group_leader);
1587
1588 posix_cpu_timers_init_group(sig);
1589
1590 tty_audit_fork(sig);
1591 sched_autogroup_fork(sig);
1592
1593 sig->oom_score_adj = current->signal->oom_score_adj;
1594 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1595
1596 mutex_init(&sig->cred_guard_mutex);
1597
1598 return 0;
1599}
1600
1601static void copy_seccomp(struct task_struct *p)
1602{
1603#ifdef CONFIG_SECCOMP
1604
1605
1606
1607
1608
1609
	assert_spin_locked(&current->sighand->siglock);
1611
1612
1613 get_seccomp_filter(current);
1614 p->seccomp = current->seccomp;
1615
1616
1617
1618
1619
1620
1621 if (task_no_new_privs(current))
1622 task_set_no_new_privs(p);
1623
1624
1625
1626
1627
1628
1629 if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
1630 set_tsk_thread_flag(p, TIF_SECCOMP);
1631#endif
1632}
1633
1634SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
1635{
1636 current->clear_child_tid = tidptr;
1637
1638 return task_pid_vnr(current);
1639}
1640
1641static void rt_mutex_init_task(struct task_struct *p)
1642{
1643 raw_spin_lock_init(&p->pi_lock);
1644#ifdef CONFIG_RT_MUTEXES
1645 p->pi_waiters = RB_ROOT_CACHED;
1646 p->pi_top_task = NULL;
1647 p->pi_blocked_on = NULL;
1648#endif
1649}
1650
1651static inline void init_task_pid_links(struct task_struct *task)
1652{
1653 enum pid_type type;
1654
1655 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
1656 INIT_HLIST_NODE(&task->pid_links[type]);
1657 }
1658}
1659
1660static inline void
1661init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
1662{
1663 if (type == PIDTYPE_PID)
1664 task->thread_pid = pid;
1665 else
1666 task->signal->pids[type] = pid;
1667}
1668
1669static inline void rcu_copy_process(struct task_struct *p)
1670{
1671#ifdef CONFIG_PREEMPT_RCU
1672 p->rcu_read_lock_nesting = 0;
1673 p->rcu_read_unlock_special.s = 0;
1674 p->rcu_blocked_node = NULL;
1675 INIT_LIST_HEAD(&p->rcu_node_entry);
1676#endif
1677#ifdef CONFIG_TASKS_RCU
1678 p->rcu_tasks_holdout = false;
1679 INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
1680 p->rcu_tasks_idle_cpu = -1;
1681#endif
1682}
1683
1684struct pid *pidfd_pid(const struct file *file)
1685{
1686 if (file->f_op == &pidfd_fops)
1687 return file->private_data;
1688
1689 return ERR_PTR(-EBADF);
1690}
1691
1692static int pidfd_release(struct inode *inode, struct file *file)
1693{
1694 struct pid *pid = file->private_data;
1695
1696 file->private_data = NULL;
1697 put_pid(pid);
1698 return 0;
1699}
1700
1701#ifdef CONFIG_PROC_FS
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1738{
1739 struct pid *pid = f->private_data;
1740 struct pid_namespace *ns;
1741 pid_t nr = -1;
1742
1743 if (likely(pid_has_task(pid, PIDTYPE_PID))) {
1744 ns = proc_pid_ns(file_inode(m->file));
1745 nr = pid_nr_ns(pid, ns);
1746 }
1747
1748 seq_put_decimal_ll(m, "Pid:\t", nr);
1749
1750#ifdef CONFIG_PID_NS
1751 seq_put_decimal_ll(m, "\nNSpid:\t", nr);
1752 if (nr > 0) {
1753 int i;
1754
1755
1756
1757
1758
1759
1760 for (i = ns->level + 1; i <= pid->level; i++)
1761 seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
1762 }
1763#endif
1764 seq_putc(m, '\n');
1765}
1766#endif
1767
1768
1769
1770
1771static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
1772{
1773 struct task_struct *task;
1774 struct pid *pid = file->private_data;
1775 __poll_t poll_flags = 0;
1776
1777 poll_wait(file, &pid->wait_pidfd, pts);
1778
1779 rcu_read_lock();
1780 task = pid_task(pid, PIDTYPE_PID);
1781
1782
1783
1784
1785
1786 if (!task || (task->exit_state && thread_group_empty(task)))
1787 poll_flags = EPOLLIN | EPOLLRDNORM;
1788 rcu_read_unlock();
1789
1790 return poll_flags;
1791}
1792
1793const struct file_operations pidfd_fops = {
1794 .release = pidfd_release,
1795 .poll = pidfd_poll,
1796#ifdef CONFIG_PROC_FS
1797 .show_fdinfo = pidfd_show_fdinfo,
1798#endif
1799};
1800
1801static void __delayed_free_task(struct rcu_head *rhp)
1802{
1803 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
1804
1805 free_task(tsk);
1806}
1807
1808static __always_inline void delayed_free_task(struct task_struct *tsk)
1809{
1810 if (IS_ENABLED(CONFIG_MEMCG))
1811 call_rcu(&tsk->rcu, __delayed_free_task);
1812 else
1813 free_task(tsk);
1814}
1815
1816
1817
1818
1819
1820
1821
1822
1823
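/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags).  The actual kick-off is left to the caller.
 */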
1824static __latent_entropy struct task_struct *copy_process(
1825 struct pid *pid,
1826 int trace,
1827 int node,
1828 struct kernel_clone_args *args)
1829{
1830 int pidfd = -1, retval;
1831 struct task_struct *p;
1832 struct multiprocess_signals delayed;
1833 struct file *pidfile = NULL;
1834 u64 clone_flags = args->flags;
1835 struct nsproxy *nsp = current->nsproxy;
1836
1837
1838
1839
1840
1841 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1842 return ERR_PTR(-EINVAL);
1843
1844 if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
1845 return ERR_PTR(-EINVAL);
1846
1847
1848
1849
1850
1851 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
1852 return ERR_PTR(-EINVAL);
1853
1854
1855
1856
1857
1858
1859 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
1860 return ERR_PTR(-EINVAL);
1861
1862
1863
1864
1865
1866
1867
1868 if ((clone_flags & CLONE_PARENT) &&
1869 current->signal->flags & SIGNAL_UNKILLABLE)
1870 return ERR_PTR(-EINVAL);
1871
1872
1873
1874
1875
1876 if (clone_flags & CLONE_THREAD) {
1877 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1878 (task_active_pid_ns(current) != nsp->pid_ns_for_children))
1879 return ERR_PTR(-EINVAL);
1880 }
1881
1882
1883
1884
1885
1886 if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
1887 if (nsp->time_ns != nsp->time_ns_for_children)
1888 return ERR_PTR(-EINVAL);
1889 }
1890
1891 if (clone_flags & CLONE_PIDFD) {
1892
1893
1894
1895
1896
1897 if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
1898 return ERR_PTR(-EINVAL);
1899 }
1900
1901
1902
1903
1904
1905
1906
1907 sigemptyset(&delayed.signal);
1908 INIT_HLIST_NODE(&delayed.node);
1909
	spin_lock_irq(&current->sighand->siglock);
	if (!(clone_flags & CLONE_THREAD))
		hlist_add_head(&delayed.node, &current->signal->multiprocess);
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);
1915 retval = -ERESTARTNOINTR;
1916 if (signal_pending(current))
1917 goto fork_out;
1918
1919 retval = -ENOMEM;
1920 p = dup_task_struct(current, node);
1921 if (!p)
1922 goto fork_out;
1923
1924
1925
1926
1927
1928
1929
1930 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
1931
1932
1933
1934 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
1935
1936 ftrace_graph_init_task(p);
1937
1938 rt_mutex_init_task(p);
1939
1940#ifdef CONFIG_PROVE_LOCKING
1941 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
1942 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
1943#endif
1944 retval = -EAGAIN;
1945 if (atomic_read(&p->real_cred->user->processes) >=
1946 task_rlimit(p, RLIMIT_NPROC)) {
1947 if (p->real_cred->user != INIT_USER &&
1948 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
1949 goto bad_fork_free;
1950 }
1951 current->flags &= ~PF_NPROC_EXCEEDED;
1952
1953 retval = copy_creds(p, clone_flags);
1954 if (retval < 0)
1955 goto bad_fork_free;
1956
1957
1958
1959
1960
1961
1962 retval = -EAGAIN;
1963 if (nr_threads >= max_threads)
1964 goto bad_fork_cleanup_count;
1965
1966 delayacct_tsk_init(p);
1967 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
1968 p->flags |= PF_FORKNOEXEC;
1969 INIT_LIST_HEAD(&p->children);
1970 INIT_LIST_HEAD(&p->sibling);
1971 rcu_copy_process(p);
1972 p->vfork_done = NULL;
1973 spin_lock_init(&p->alloc_lock);
1974
1975 init_sigpending(&p->pending);
1976
1977 p->utime = p->stime = p->gtime = 0;
1978#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
1979 p->utimescaled = p->stimescaled = 0;
1980#endif
1981 prev_cputime_init(&p->prev_cputime);
1982
1983#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1984 seqcount_init(&p->vtime.seqcount);
1985 p->vtime.starttime = 0;
1986 p->vtime.state = VTIME_INACTIVE;
1987#endif
1988
1989#if defined(SPLIT_RSS_COUNTING)
1990 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1991#endif
1992
1993 p->default_timer_slack_ns = current->timer_slack_ns;
1994
1995#ifdef CONFIG_PSI
1996 p->psi_flags = 0;
1997#endif
1998
1999 task_io_accounting_init(&p->ioac);
2000 acct_clear_integrals(p);
2001
2002 posix_cputimers_init(&p->posix_cputimers);
2003
2004 p->io_context = NULL;
2005 audit_set_context(p, NULL);
2006 cgroup_fork(p);
2007#ifdef CONFIG_NUMA
2008 p->mempolicy = mpol_dup(p->mempolicy);
2009 if (IS_ERR(p->mempolicy)) {
2010 retval = PTR_ERR(p->mempolicy);
2011 p->mempolicy = NULL;
2012 goto bad_fork_cleanup_threadgroup_lock;
2013 }
2014#endif
2015#ifdef CONFIG_CPUSETS
2016 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
2017 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
2018 seqcount_init(&p->mems_allowed_seq);
2019#endif
2020#ifdef CONFIG_TRACE_IRQFLAGS
2021 p->irq_events = 0;
2022 p->hardirqs_enabled = 0;
2023 p->hardirq_enable_ip = 0;
2024 p->hardirq_enable_event = 0;
2025 p->hardirq_disable_ip = _THIS_IP_;
2026 p->hardirq_disable_event = 0;
2027 p->softirqs_enabled = 1;
2028 p->softirq_enable_ip = _THIS_IP_;
2029 p->softirq_enable_event = 0;
2030 p->softirq_disable_ip = 0;
2031 p->softirq_disable_event = 0;
2032 p->hardirq_context = 0;
2033 p->softirq_context = 0;
2034#endif
2035
2036 p->pagefault_disabled = 0;
2037
2038#ifdef CONFIG_LOCKDEP
2039 lockdep_init_task(p);
2040#endif
2041
2042#ifdef CONFIG_DEBUG_MUTEXES
2043 p->blocked_on = NULL;
2044#endif
2045#ifdef CONFIG_BCACHE
2046 p->sequential_io = 0;
2047 p->sequential_io_avg = 0;
2048#endif
2049
2050
2051 retval = sched_fork(clone_flags, p);
2052 if (retval)
2053 goto bad_fork_cleanup_policy;
2054
2055 retval = perf_event_init_task(p);
2056 if (retval)
2057 goto bad_fork_cleanup_policy;
2058 retval = audit_alloc(p);
2059 if (retval)
2060 goto bad_fork_cleanup_perf;
2061
2062 shm_init_task(p);
2063 retval = security_task_alloc(p, clone_flags);
2064 if (retval)
2065 goto bad_fork_cleanup_audit;
2066 retval = copy_semundo(clone_flags, p);
2067 if (retval)
2068 goto bad_fork_cleanup_security;
2069 retval = copy_files(clone_flags, p);
2070 if (retval)
2071 goto bad_fork_cleanup_semundo;
2072 retval = copy_fs(clone_flags, p);
2073 if (retval)
2074 goto bad_fork_cleanup_files;
2075 retval = copy_sighand(clone_flags, p);
2076 if (retval)
2077 goto bad_fork_cleanup_fs;
2078 retval = copy_signal(clone_flags, p);
2079 if (retval)
2080 goto bad_fork_cleanup_sighand;
2081 retval = copy_mm(clone_flags, p);
2082 if (retval)
2083 goto bad_fork_cleanup_signal;
2084 retval = copy_namespaces(clone_flags, p);
2085 if (retval)
2086 goto bad_fork_cleanup_mm;
2087 retval = copy_io(clone_flags, p);
2088 if (retval)
2089 goto bad_fork_cleanup_namespaces;
2090 retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
2091 args->tls);
2092 if (retval)
2093 goto bad_fork_cleanup_io;
2094
2095 stackleak_task_init(p);
2096
2097 if (pid != &init_struct_pid) {
2098 pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
2099 args->set_tid_size);
2100 if (IS_ERR(pid)) {
2101 retval = PTR_ERR(pid);
2102 goto bad_fork_cleanup_thread;
2103 }
2104 }
2105
2106
2107
2108
2109
2110
2111 if (clone_flags & CLONE_PIDFD) {
2112 retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
2113 if (retval < 0)
2114 goto bad_fork_free_pid;
2115
2116 pidfd = retval;
2117
2118 pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
2119 O_RDWR | O_CLOEXEC);
2120 if (IS_ERR(pidfile)) {
2121 put_unused_fd(pidfd);
2122 retval = PTR_ERR(pidfile);
2123 goto bad_fork_free_pid;
2124 }
2125 get_pid(pid);
2126
2127 retval = put_user(pidfd, args->pidfd);
2128 if (retval)
2129 goto bad_fork_put_pidfd;
2130 }
2131
2132#ifdef CONFIG_BLOCK
2133 p->plug = NULL;
2134#endif
2135 futex_init_task(p);
2136
2137
2138
2139
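	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */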
2140 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
2141 sas_ss_reset(p);
2142
2143
2144
2145
2146
2147 user_disable_single_step(p);
2148 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
2149#ifdef TIF_SYSCALL_EMU
2150 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
2151#endif
2152 clear_tsk_latency_tracing(p);
2153
2154
2155 p->pid = pid_nr(pid);
2156 if (clone_flags & CLONE_THREAD) {
2157 p->exit_signal = -1;
2158 p->group_leader = current->group_leader;
2159 p->tgid = current->tgid;
2160 } else {
2161 if (clone_flags & CLONE_PARENT)
2162 p->exit_signal = current->group_leader->exit_signal;
2163 else
2164 p->exit_signal = args->exit_signal;
2165 p->group_leader = p;
2166 p->tgid = p->pid;
2167 }
2168
2169 p->nr_dirtied = 0;
2170 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
2171 p->dirty_paused_when = 0;
2172
2173 p->pdeath_signal = 0;
2174 INIT_LIST_HEAD(&p->thread_group);
2175 p->task_works = NULL;
2176
2177 cgroup_threadgroup_change_begin(current);
2178
2179
2180
2181
2182
2183
2184 retval = cgroup_can_fork(p);
2185 if (retval)
2186 goto bad_fork_cgroup_threadgroup_change_end;
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196 p->start_time = ktime_get_ns();
2197 p->start_boottime = ktime_get_boottime_ns();
2198
2199
2200
2201
2202
2203 write_lock_irq(&tasklist_lock);
2204
2205
2206 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
2207 p->real_parent = current->real_parent;
2208 p->parent_exec_id = current->parent_exec_id;
2209 } else {
2210 p->real_parent = current;
2211 p->parent_exec_id = current->self_exec_id;
2212 }
2213
2214 klp_copy_process(p);
2215
	spin_lock(&current->sighand->siglock);
2217
2218
2219
2220
2221
2222 copy_seccomp(p);
2223
2224 rseq_fork(p, clone_flags);
2225
2226
2227 if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
2228 retval = -ENOMEM;
2229 goto bad_fork_cancel_cgroup;
2230 }
2231
2232
2233 if (fatal_signal_pending(current)) {
2234 retval = -EINTR;
2235 goto bad_fork_cancel_cgroup;
2236 }
2237
2238
2239 if (pidfile)
2240 fd_install(pidfd, pidfile);
2241
2242 init_task_pid_links(p);
2243 if (likely(p->pid)) {
2244 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
2245
2246 init_task_pid(p, PIDTYPE_PID, pid);
2247 if (thread_group_leader(p)) {
2248 init_task_pid(p, PIDTYPE_TGID, pid);
2249 init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
2250 init_task_pid(p, PIDTYPE_SID, task_session(current));
2251
2252 if (is_child_reaper(pid)) {
2253 ns_of_pid(pid)->child_reaper = p;
2254 p->signal->flags |= SIGNAL_UNKILLABLE;
2255 }
2256 p->signal->shared_pending.signal = delayed.signal;
2257 p->signal->tty = tty_kref_get(current->signal->tty);
2258
2259
2260
2261
2262
2263 p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
2264 p->real_parent->signal->is_child_subreaper;
2265 list_add_tail(&p->sibling, &p->real_parent->children);
2266 list_add_tail_rcu(&p->tasks, &init_task.tasks);
2267 attach_pid(p, PIDTYPE_TGID);
2268 attach_pid(p, PIDTYPE_PGID);
2269 attach_pid(p, PIDTYPE_SID);
2270 __this_cpu_inc(process_counts);
2271 } else {
			current->signal->nr_threads++;
			atomic_inc(&current->signal->live);
			refcount_inc(&current->signal->sigcnt);
2275 task_join_group_stop(p);
2276 list_add_tail_rcu(&p->thread_group,
2277 &p->group_leader->thread_group);
2278 list_add_tail_rcu(&p->thread_node,
2279 &p->signal->thread_head);
2280 }
2281 attach_pid(p, PIDTYPE_PID);
2282 nr_threads++;
2283 }
2284 total_forks++;
2285 hlist_del_init(&delayed.node);
	spin_unlock(&current->sighand->siglock);
2287 syscall_tracepoint_update(p);
2288 write_unlock_irq(&tasklist_lock);
2289
2290 proc_fork_connector(p);
2291 cgroup_post_fork(p);
2292 cgroup_threadgroup_change_end(current);
2293 perf_event_fork(p);
2294
2295 trace_task_newtask(p, clone_flags);
2296 uprobe_copy_process(p, clone_flags);
2297
2298 return p;
2299
2300bad_fork_cancel_cgroup:
	spin_unlock(&current->sighand->siglock);
2302 write_unlock_irq(&tasklist_lock);
2303 cgroup_cancel_fork(p);
2304bad_fork_cgroup_threadgroup_change_end:
2305 cgroup_threadgroup_change_end(current);
2306bad_fork_put_pidfd:
2307 if (clone_flags & CLONE_PIDFD) {
2308 fput(pidfile);
2309 put_unused_fd(pidfd);
2310 }
2311bad_fork_free_pid:
2312 if (pid != &init_struct_pid)
2313 free_pid(pid);
2314bad_fork_cleanup_thread:
2315 exit_thread(p);
2316bad_fork_cleanup_io:
2317 if (p->io_context)
2318 exit_io_context(p);
2319bad_fork_cleanup_namespaces:
2320 exit_task_namespaces(p);
2321bad_fork_cleanup_mm:
2322 if (p->mm) {
2323 mm_clear_owner(p->mm, p);
2324 mmput(p->mm);
2325 }
2326bad_fork_cleanup_signal:
2327 if (!(clone_flags & CLONE_THREAD))
2328 free_signal_struct(p->signal);
2329bad_fork_cleanup_sighand:
2330 __cleanup_sighand(p->sighand);
2331bad_fork_cleanup_fs:
2332 exit_fs(p);
2333bad_fork_cleanup_files:
2334 exit_files(p);
2335bad_fork_cleanup_semundo:
2336 exit_sem(p);
2337bad_fork_cleanup_security:
2338 security_task_free(p);
2339bad_fork_cleanup_audit:
2340 audit_free(p);
2341bad_fork_cleanup_perf:
2342 perf_event_free_task(p);
2343bad_fork_cleanup_policy:
2344 lockdep_free_task(p);
2345#ifdef CONFIG_NUMA
2346 mpol_put(p->mempolicy);
2347bad_fork_cleanup_threadgroup_lock:
2348#endif
2349 delayacct_tsk_free(p);
2350bad_fork_cleanup_count:
2351 atomic_dec(&p->cred->user->processes);
2352 exit_creds(p);
2353bad_fork_free:
2354 p->state = TASK_DEAD;
2355 put_task_stack(p);
2356 delayed_free_task(p);
2357fork_out:
	spin_lock_irq(&current->sighand->siglock);
	hlist_del_init(&delayed.node);
	spin_unlock_irq(&current->sighand->siglock);
2361 return ERR_PTR(retval);
2362}
2363
2364static inline void init_idle_pids(struct task_struct *idle)
2365{
2366 enum pid_type type;
2367
2368 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
2369 INIT_HLIST_NODE(&idle->pid_links[type]);
2370 init_task_pid(idle, type, &init_struct_pid);
2371 }
2372}
2373
2374struct task_struct *fork_idle(int cpu)
2375{
2376 struct task_struct *task;
2377 struct kernel_clone_args args = {
2378 .flags = CLONE_VM,
2379 };
2380
2381 task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2382 if (!IS_ERR(task)) {
2383 init_idle_pids(task);
2384 init_idle(task, cpu);
2385 }
2386
2387 return task;
2388}
2389
2390struct mm_struct *copy_init_mm(void)
2391{
2392 return dup_mm(NULL, &init_mm);
2393}
2394
2395
2396
2397
2398
2399
2400
2401
2402
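/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 */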
2403long _do_fork(struct kernel_clone_args *args)
2404{
2405 u64 clone_flags = args->flags;
2406 struct completion vfork;
2407 struct pid *pid;
2408 struct task_struct *p;
2409 int trace = 0;
2410 long nr;
2411
2412
2413
2414
2415
2416
2417
2418 if (!(clone_flags & CLONE_UNTRACED)) {
2419 if (clone_flags & CLONE_VFORK)
2420 trace = PTRACE_EVENT_VFORK;
2421 else if (args->exit_signal != SIGCHLD)
2422 trace = PTRACE_EVENT_CLONE;
2423 else
2424 trace = PTRACE_EVENT_FORK;
2425
2426 if (likely(!ptrace_event_enabled(current, trace)))
2427 trace = 0;
2428 }
2429
2430 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2431 add_latent_entropy();
2432
2433 if (IS_ERR(p))
2434 return PTR_ERR(p);
2435
2436
2437
2438
2439
2440 trace_sched_process_fork(current, p);
2441
2442 pid = get_task_pid(p, PIDTYPE_PID);
2443 nr = pid_vnr(pid);
2444
2445 if (clone_flags & CLONE_PARENT_SETTID)
2446 put_user(nr, args->parent_tid);
2447
2448 if (clone_flags & CLONE_VFORK) {
2449 p->vfork_done = &vfork;
2450 init_completion(&vfork);
2451 get_task_struct(p);
2452 }
2453
2454 wake_up_new_task(p);
2455
2456
2457 if (unlikely(trace))
2458 ptrace_event_pid(trace, pid);
2459
2460 if (clone_flags & CLONE_VFORK) {
2461 if (!wait_for_vfork_done(p, &vfork))
2462 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
2463 }
2464
2465 put_pid(pid);
2466 return nr;
2467}
2468
2469bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
2470{
2471
2472 if ((kargs->flags & CLONE_PIDFD) &&
2473 (kargs->flags & CLONE_PARENT_SETTID))
2474 return false;
2475
2476 return true;
2477}
2478
2479#ifndef CONFIG_HAVE_COPY_THREAD_TLS
2480
2481
2482long do_fork(unsigned long clone_flags,
2483 unsigned long stack_start,
2484 unsigned long stack_size,
2485 int __user *parent_tidptr,
2486 int __user *child_tidptr)
2487{
2488 struct kernel_clone_args args = {
2489 .flags = (clone_flags & ~CSIGNAL),
2490 .pidfd = parent_tidptr,
2491 .child_tid = child_tidptr,
2492 .parent_tid = parent_tidptr,
2493 .exit_signal = (clone_flags & CSIGNAL),
2494 .stack = stack_start,
2495 .stack_size = stack_size,
2496 };
2497
2498 if (!legacy_clone_args_valid(&args))
2499 return -EINVAL;
2500
2501 return _do_fork(&args);
2502}
2503#endif
2504
2505
2506
2507
2508pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2509{
2510 struct kernel_clone_args args = {
2511 .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
2512 .exit_signal = (flags & CSIGNAL),
2513 .stack = (unsigned long)fn,
2514 .stack_size = (unsigned long)arg,
2515 };
2516
2517 return _do_fork(&args);
2518}
2519
2520#ifdef __ARCH_WANT_SYS_FORK
2521SYSCALL_DEFINE0(fork)
2522{
2523#ifdef CONFIG_MMU
2524 struct kernel_clone_args args = {
2525 .exit_signal = SIGCHLD,
2526 };
2527
2528 return _do_fork(&args);
2529#else
2530
2531 return -EINVAL;
2532#endif
2533}
2534#endif
2535
2536#ifdef __ARCH_WANT_SYS_VFORK
2537SYSCALL_DEFINE0(vfork)
2538{
2539 struct kernel_clone_args args = {
2540 .flags = CLONE_VFORK | CLONE_VM,
2541 .exit_signal = SIGCHLD,
2542 };
2543
2544 return _do_fork(&args);
2545}
2546#endif
2547
2548#ifdef __ARCH_WANT_SYS_CLONE
2549#ifdef CONFIG_CLONE_BACKWARDS
2550SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2551 int __user *, parent_tidptr,
2552 unsigned long, tls,
2553 int __user *, child_tidptr)
2554#elif defined(CONFIG_CLONE_BACKWARDS2)
2555SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
2556 int __user *, parent_tidptr,
2557 int __user *, child_tidptr,
2558 unsigned long, tls)
2559#elif defined(CONFIG_CLONE_BACKWARDS3)
2560SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
2561 int, stack_size,
2562 int __user *, parent_tidptr,
2563 int __user *, child_tidptr,
2564 unsigned long, tls)
2565#else
2566SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2567 int __user *, parent_tidptr,
2568 int __user *, child_tidptr,
2569 unsigned long, tls)
2570#endif
2571{
2572 struct kernel_clone_args args = {
2573 .flags = (clone_flags & ~CSIGNAL),
2574 .pidfd = parent_tidptr,
2575 .child_tid = child_tidptr,
2576 .parent_tid = parent_tidptr,
2577 .exit_signal = (clone_flags & CSIGNAL),
2578 .stack = newsp,
2579 .tls = tls,
2580 };
2581
2582 if (!legacy_clone_args_valid(&args))
2583 return -EINVAL;
2584
2585 return _do_fork(&args);
2586}
2587#endif
2588
2589#ifdef __ARCH_WANT_SYS_CLONE3
2590
/*
 * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
 * the registers containing the syscall arguments for clone. This doesn't work
 * with clone3 since the TLS value is passed in clone_args instead.
 */
2596#ifndef CONFIG_HAVE_COPY_THREAD_TLS
2597#error clone3 requires copy_thread_tls support in arch
2598#endif
2599
2600noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2601 struct clone_args __user *uargs,
2602 size_t usize)
2603{
2604 int err;
2605 struct clone_args args;
2606 pid_t *kset_tid = kargs->set_tid;
2607
2608 if (unlikely(usize > PAGE_SIZE))
2609 return -E2BIG;
2610 if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
2611 return -EINVAL;
2612
2613 err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
2614 if (err)
2615 return err;
2616
2617 if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
2618 return -EINVAL;
2619
2620 if (unlikely(!args.set_tid && args.set_tid_size > 0))
2621 return -EINVAL;
2622
2623 if (unlikely(args.set_tid && args.set_tid_size == 0))
2624 return -EINVAL;
2625
	/*
	 * Verify that the upper 32 bits of exit_signal are unset and that
	 * it is a valid signal.
	 */
2630 if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
2631 !valid_signal(args.exit_signal)))
2632 return -EINVAL;
2633
2634 *kargs = (struct kernel_clone_args){
2635 .flags = args.flags,
2636 .pidfd = u64_to_user_ptr(args.pidfd),
2637 .child_tid = u64_to_user_ptr(args.child_tid),
2638 .parent_tid = u64_to_user_ptr(args.parent_tid),
2639 .exit_signal = args.exit_signal,
2640 .stack = args.stack,
2641 .stack_size = args.stack_size,
2642 .tls = args.tls,
2643 .set_tid_size = args.set_tid_size,
2644 };
2645
2646 if (args.set_tid &&
2647 copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
2648 (kargs->set_tid_size * sizeof(pid_t))))
2649 return -EFAULT;
2650
2651 kargs->set_tid = kset_tid;
2652
2653 return 0;
2654}
2655
/**
 * clone3_stack_valid - check and prepare stack
 * @kargs: kernel clone args
 *
 * Verify that the stack arguments userspace gave us are sane.
 * In addition, set the stack direction for userspace since it's easy for us to
 * determine.
 */
2664static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
2665{
2666 if (kargs->stack == 0) {
2667 if (kargs->stack_size > 0)
2668 return false;
2669 } else {
2670 if (kargs->stack_size == 0)
2671 return false;
2672
2673 if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
2674 return false;
2675
#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
	/* The stack grows down here, so point at the top of the given area. */
	kargs->stack += kargs->stack_size;
#endif
2679 }
2680
2681 return true;
2682}
2683
2684static bool clone3_args_valid(struct kernel_clone_args *kargs)
2685{
	/* Verify that no unknown flags are passed along. */
2687 if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
2688 return false;
2689
	/*
	 * - make the CLONE_DETACHED bit reusable for clone3
	 * - make the CSIGNAL bits reusable for clone3
	 */
2694 if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
2695 return false;
2696
2697 if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
2698 (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
2699 return false;
2700
2701 if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
2702 kargs->exit_signal)
2703 return false;
2704
2705 if (!clone3_stack_valid(kargs))
2706 return false;
2707
2708 return true;
2709}
2710
/**
 * clone3 - create a new process with specific properties
 * @uargs: argument structure
 * @size:  size of @uargs
 *
 * clone3() is the extensible successor to clone()/clone2().
 * It takes a struct as argument that is versioned by its size.
 *
 * Return: On success, a positive PID for the child process.
 *         On error, a negative errno number.
 */
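/*
 * Illustrative userspace sketch (not part of this file): callers pass a
 * zero-initialized struct clone_args and its size through the raw syscall,
 * e.g.
 *
 *	int pidfd = -1;
 *	struct clone_args args = {
 *		.flags       = CLONE_PIDFD,
 *		.pidfd       = (__u64)(uintptr_t)&pidfd,
 *		.exit_signal = SIGCHLD,
 *	};
 *	pid_t child = syscall(__NR_clone3, &args, sizeof(args));
 *
 * Unknown trailing fields must be zero; copy_struct_from_user() above
 * rejects anything else with -E2BIG.
 */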
2722SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2723{
2724 int err;
2725
2726 struct kernel_clone_args kargs;
2727 pid_t set_tid[MAX_PID_NS_LEVEL];
2728
2729 kargs.set_tid = set_tid;
2730
2731 err = copy_clone_args_from_user(&kargs, uargs, size);
2732 if (err)
2733 return err;
2734
2735 if (!clone3_args_valid(&kargs))
2736 return -EINVAL;
2737
2738 return _do_fork(&kargs);
2739}
2740#endif
2741
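/*
 * walk_process_tree - depth-first walk of the process tree below @top,
 * invoking @visitor for every child while tasklist_lock is held for reading.
 *
 * A positive return value from @visitor descends into that child's own
 * subtree, zero moves on to the next sibling, and a negative value aborts
 * the walk.
 */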
2742void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
2743{
2744 struct task_struct *leader, *parent, *child;
2745 int res;
2746
2747 read_lock(&tasklist_lock);
2748 leader = top = top->group_leader;
2749down:
2750 for_each_thread(leader, parent) {
2751 list_for_each_entry(child, &parent->children, sibling) {
2752 res = visitor(child, data);
2753 if (res) {
2754 if (res < 0)
2755 goto out;
2756 leader = child;
2757 goto down;
2758 }
2759up:
2760 ;
2761 }
2762 }
2763
2764 if (leader != top) {
2765 child = leader;
2766 parent = child->real_parent;
2767 leader = parent->group_leader;
2768 goto up;
2769 }
2770out:
2771 read_unlock(&tasklist_lock);
2772}
2773
2774#ifndef ARCH_MIN_MMSTRUCT_ALIGN
2775#define ARCH_MIN_MMSTRUCT_ALIGN 0
2776#endif
2777
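/*
 * Slab constructor for sighand_struct: initializes siglock and the signalfd
 * wait queue head when the backing slab memory is first set up, which keeps
 * them valid across object reuse in the SLAB_TYPESAFE_BY_RCU sighand_cache
 * created below.
 */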
2778static void sighand_ctor(void *data)
2779{
2780 struct sighand_struct *sighand = data;
2781
2782 spin_lock_init(&sighand->siglock);
2783 init_waitqueue_head(&sighand->signalfd_wqh);
2784}
2785
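/*
 * Set up the slab caches that back the per-process bookkeeping structures
 * (sighand_struct, signal_struct, files_struct, fs_struct, mm_struct and
 * vm_area_struct) during early boot.
 */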
2786void __init proc_caches_init(void)
2787{
2788 unsigned int mm_size;
2789
2790 sighand_cachep = kmem_cache_create("sighand_cache",
2791 sizeof(struct sighand_struct), 0,
2792 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
2793 SLAB_ACCOUNT, sighand_ctor);
2794 signal_cachep = kmem_cache_create("signal_cache",
2795 sizeof(struct signal_struct), 0,
2796 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2797 NULL);
2798 files_cachep = kmem_cache_create("files_cache",
2799 sizeof(struct files_struct), 0,
2800 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2801 NULL);
2802 fs_cachep = kmem_cache_create("fs_cache",
2803 sizeof(struct fs_struct), 0,
2804 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2805 NULL);
2806
	/*
	 * The mm_cpumask is located at the end of mm_struct, and is
	 * dynamically sized based on the maximum CPU number this system
	 * can have, taking hotplug into account (nr_cpu_ids).
	 */
2812 mm_size = sizeof(struct mm_struct) + cpumask_size();
2813
2814 mm_cachep = kmem_cache_create_usercopy("mm_struct",
2815 mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
2816 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2817 offsetof(struct mm_struct, saved_auxv),
2818 sizeof_field(struct mm_struct, saved_auxv),
2819 NULL);
2820 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
2821 mmap_init();
2822 nsproxy_cache_init();
2823}
2824
/*
 * Check constraints on flags passed to the unshare system call.
 */
2828static int check_unshare_flags(unsigned long unshare_flags)
2829{
2830 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
2831 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
2832 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
2833 CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
2834 CLONE_NEWTIME))
2835 return -EINVAL;
	/*
	 * Not implemented, but pretend it works if there is nothing
	 * to unshare.  Note that unsharing the address space or the
	 * signal handlers also needs to unshare the signal queues (aka
	 * CLONE_THREAD).
	 */
2842 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
2843 if (!thread_group_empty(current))
2844 return -EINVAL;
2845 }
2846 if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
		if (refcount_read(&current->sighand->count) > 1)
2848 return -EINVAL;
2849 }
2850 if (unshare_flags & CLONE_VM) {
2851 if (!current_is_single_threaded())
2852 return -EINVAL;
2853 }
2854
2855 return 0;
2856}
2857
/*
 * Unshare the filesystem structure if it is being shared
 */
2861static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
2862{
2863 struct fs_struct *fs = current->fs;
2864
2865 if (!(unshare_flags & CLONE_FS) || !fs)
2866 return 0;
2867
	/* No lock needed here; in the worst case we do a useless copy. */
2869 if (fs->users == 1)
2870 return 0;
2871
2872 *new_fsp = copy_fs_struct(fs);
2873 if (!*new_fsp)
2874 return -ENOMEM;
2875
2876 return 0;
2877}
2878
/*
 * Unshare file descriptor table if it is being shared
 */
2882static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
2883{
2884 struct files_struct *fd = current->files;
2885 int error = 0;
2886
2887 if ((unshare_flags & CLONE_FILES) &&
2888 (fd && atomic_read(&fd->count) > 1)) {
2889 *new_fdp = dup_fd(fd, &error);
2890 if (!*new_fdp)
2891 return error;
2892 }
2893
2894 return 0;
2895}
2896
/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by do_fork() cannot be used here directly
 * because they modify an inactive task_struct that is being
 * constructed. Here we are modifying the current, active,
 * task_struct.
 */
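/*
 * Illustrative userspace sketch (not part of this file): a single-threaded
 * process might call unshare(CLONE_NEWNS) or unshare(CLONE_FILES) to stop
 * sharing that piece of state; the implied-flag fixups below then widen the
 * request (e.g. CLONE_NEWUSER pulls in CLONE_THREAD and CLONE_FS).
 */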
2905int ksys_unshare(unsigned long unshare_flags)
2906{
2907 struct fs_struct *fs, *new_fs = NULL;
2908 struct files_struct *fd, *new_fd = NULL;
2909 struct cred *new_cred = NULL;
2910 struct nsproxy *new_nsproxy = NULL;
2911 int do_sysvsem = 0;
2912 int err;
2913
	/*
	 * Unsharing a user namespace must also unshare the thread group
	 * and the filesystem root and working directories.
	 */
2918 if (unshare_flags & CLONE_NEWUSER)
2919 unshare_flags |= CLONE_THREAD | CLONE_FS;
	/*
	 * Unsharing the VM must also unshare the signal handlers.
	 */
2923 if (unshare_flags & CLONE_VM)
2924 unshare_flags |= CLONE_SIGHAND;
	/*
	 * Unsharing signal handlers must also unshare the signal queues.
	 */
2928 if (unshare_flags & CLONE_SIGHAND)
2929 unshare_flags |= CLONE_THREAD;
	/*
	 * Unsharing the mount namespace must also unshare filesystem information.
	 */
2933 if (unshare_flags & CLONE_NEWNS)
2934 unshare_flags |= CLONE_FS;
2935
2936 err = check_unshare_flags(unshare_flags);
2937 if (err)
2938 goto bad_unshare_out;
	/*
	 * CLONE_NEWIPC must also detach from the undolist: after switching
	 * to a new ipc namespace, the semaphore arrays from the old
	 * namespace are unreachable.
	 */
2944 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
2945 do_sysvsem = 1;
2946 err = unshare_fs(unshare_flags, &new_fs);
2947 if (err)
2948 goto bad_unshare_out;
2949 err = unshare_fd(unshare_flags, &new_fd);
2950 if (err)
2951 goto bad_unshare_cleanup_fs;
2952 err = unshare_userns(unshare_flags, &new_cred);
2953 if (err)
2954 goto bad_unshare_cleanup_fd;
2955 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
2956 new_cred, new_fs);
2957 if (err)
2958 goto bad_unshare_cleanup_cred;
2959
2960 if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
2961 if (do_sysvsem) {
			/*
			 * Detach from the System V semaphore undo lists, as on exit.
			 */
2965 exit_sem(current);
2966 }
2967 if (unshare_flags & CLONE_NEWIPC) {
			/* Orphan SysV shm segments left in the old IPC namespace. */
2969 exit_shm(current);
2970 shm_init_task(current);
2971 }
2972
2973 if (new_nsproxy)
2974 switch_task_namespaces(current, new_nsproxy);
2975
2976 task_lock(current);
2977
2978 if (new_fs) {
2979 fs = current->fs;
2980 spin_lock(&fs->lock);
2981 current->fs = new_fs;
2982 if (--fs->users)
2983 new_fs = NULL;
2984 else
2985 new_fs = fs;
2986 spin_unlock(&fs->lock);
2987 }
2988
2989 if (new_fd) {
2990 fd = current->files;
2991 current->files = new_fd;
2992 new_fd = fd;
2993 }
2994
2995 task_unlock(current);
2996
2997 if (new_cred) {
			/* Install the new user namespace's credentials. */
2999 commit_creds(new_cred);
3000 new_cred = NULL;
3001 }
3002 }
3003
3004 perf_event_namespaces(current);
3005
3006bad_unshare_cleanup_cred:
3007 if (new_cred)
3008 put_cred(new_cred);
3009bad_unshare_cleanup_fd:
3010 if (new_fd)
3011 put_files_struct(new_fd);
3012
3013bad_unshare_cleanup_fs:
3014 if (new_fs)
3015 free_fs_struct(new_fs);
3016
3017bad_unshare_out:
3018 return err;
3019}
3020
3021SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
3022{
3023 return ksys_unshare(unshare_flags);
3024}
3025
/*
 * Helper to unshare the files of the current task.
 * We don't want to expose copy_files internals to
 * the exec layer of the kernel.
 */
3032int unshare_files(struct files_struct **displaced)
3033{
3034 struct task_struct *task = current;
3035 struct files_struct *copy = NULL;
3036 int error;
3037
	error = unshare_fd(CLONE_FILES, &copy);
3039 if (error || !copy) {
3040 *displaced = NULL;
3041 return error;
3042 }
3043 *displaced = task->files;
3044 task_lock(task);
3045 task->files = copy;
3046 task_unlock(task);
3047 return 0;
3048}
3049
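/*
 * Sysctl handler for the threads-max limit: writes are accepted only in the
 * range [1, MAX_THREADS] and update max_threads; reads report the current
 * limit.
 */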
3050int sysctl_max_threads(struct ctl_table *table, int write,
3051 void __user *buffer, size_t *lenp, loff_t *ppos)
3052{
3053 struct ctl_table t;
3054 int ret;
3055 int threads = max_threads;
3056 int min = 1;
3057 int max = MAX_THREADS;
3058
3059 t = *table;
3060 t.data = &threads;
3061 t.extra1 = &min;
3062 t.extra2 = &max;
3063
3064 ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3065 if (ret || !write)
3066 return ret;
3067
3068 max_threads = threads;
3069
3070 return 0;
3071}
3072