1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18#include <linux/cpu.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/sched/task.h>
22#include <linux/sched/task_stack.h>
23#include <linux/fs.h>
24#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/elfcore.h>
27#include <linux/smp.h>
28#include <linux/slab.h>
29#include <linux/user.h>
30#include <linux/interrupt.h>
31#include <linux/delay.h>
32#include <linux/export.h>
33#include <linux/ptrace.h>
34#include <linux/notifier.h>
35#include <linux/kprobes.h>
36#include <linux/kdebug.h>
37#include <linux/prctl.h>
38#include <linux/uaccess.h>
39#include <linux/io.h>
40#include <linux/ftrace.h>
41#include <linux/syscalls.h>
42
43#include <asm/processor.h>
44#include <asm/pkru.h>
45#include <asm/fpu/internal.h>
46#include <asm/mmu_context.h>
47#include <asm/prctl.h>
48#include <asm/desc.h>
49#include <asm/proto.h>
50#include <asm/ia32.h>
51#include <asm/debugreg.h>
52#include <asm/switch_to.h>
53#include <asm/xen/hypervisor.h>
54#include <asm/vdso.h>
55#include <asm/resctrl.h>
56#include <asm/unistd.h>
57#include <asm/fsgsbase.h>
58#ifdef CONFIG_IA32_EMULATION
59
60#include <asm/unistd_32_ia32.h>
61#endif
62
63#include "process.h"
64
65
66void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
67 const char *log_lvl)
68{
69 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
70 unsigned long d0, d1, d2, d3, d6, d7;
71 unsigned int fsindex, gsindex;
72 unsigned int ds, es;
73
74 show_iret_regs(regs, log_lvl);
75
76 if (regs->orig_ax != -1)
77 pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
78 else
79 pr_cont("\n");
80
81 printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
82 log_lvl, regs->ax, regs->bx, regs->cx);
83 printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
84 log_lvl, regs->dx, regs->si, regs->di);
85 printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
86 log_lvl, regs->bp, regs->r8, regs->r9);
87 printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
88 log_lvl, regs->r10, regs->r11, regs->r12);
89 printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
90 log_lvl, regs->r13, regs->r14, regs->r15);
91
92 if (mode == SHOW_REGS_SHORT)
93 return;
94
95 if (mode == SHOW_REGS_USER) {
96 rdmsrl(MSR_FS_BASE, fs);
97 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
98 printk("%sFS: %016lx GS: %016lx\n",
99 log_lvl, fs, shadowgs);
100 return;
101 }
102
103 asm("movl %%ds,%0" : "=r" (ds));
104 asm("movl %%es,%0" : "=r" (es));
105 asm("movl %%fs,%0" : "=r" (fsindex));
106 asm("movl %%gs,%0" : "=r" (gsindex));
107
108 rdmsrl(MSR_FS_BASE, fs);
109 rdmsrl(MSR_GS_BASE, gs);
110 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
111
112 cr0 = read_cr0();
113 cr2 = read_cr2();
114 cr3 = __read_cr3();
115 cr4 = __read_cr4();
116
117 printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
118 log_lvl, fs, fsindex, gs, gsindex, shadowgs);
119 printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n",
120 log_lvl, regs->cs, ds, es, cr0);
121 printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
122 log_lvl, cr2, cr3, cr4);
123
124 get_debugreg(d0, 0);
125 get_debugreg(d1, 1);
126 get_debugreg(d2, 2);
127 get_debugreg(d3, 3);
128 get_debugreg(d6, 6);
129 get_debugreg(d7, 7);
130
131
132 if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
133 (d6 == DR6_RESERVED) && (d7 == 0x400))) {
134 printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
135 log_lvl, d0, d1, d2);
136 printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
137 log_lvl, d3, d6, d7);
138 }
139
140 if (cpu_feature_enabled(X86_FEATURE_OSPKE))
141 printk("%sPKRU: %08x\n", log_lvl, read_pkru());
142}
143
144void release_thread(struct task_struct *dead_task)
145{
146 WARN_ON(dead_task->mm);
147}
148
/* Selects which of the two base-carrying segment registers a helper acts on. */
enum which_selector {
	FS = 0,	/* operate on FS */
	GS = 1	/* operate on GS */
};
153
154
155
156
157
158
159
160
161
162static noinstr unsigned long __rdgsbase_inactive(void)
163{
164 unsigned long gsbase;
165
166 lockdep_assert_irqs_disabled();
167
168 if (!static_cpu_has(X86_FEATURE_XENPV)) {
169 native_swapgs();
170 gsbase = rdgsbase();
171 native_swapgs();
172 } else {
173 instrumentation_begin();
174 rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
175 instrumentation_end();
176 }
177
178 return gsbase;
179}
180
181
182
183
184
185
186
187
188
189static noinstr void __wrgsbase_inactive(unsigned long gsbase)
190{
191 lockdep_assert_irqs_disabled();
192
193 if (!static_cpu_has(X86_FEATURE_XENPV)) {
194 native_swapgs();
195 wrgsbase(gsbase);
196 native_swapgs();
197 } else {
198 instrumentation_begin();
199 wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
200 instrumentation_end();
201 }
202}
203
204
205
206
207
208
209
210static __always_inline void save_base_legacy(struct task_struct *prev_p,
211 unsigned short selector,
212 enum which_selector which)
213{
214 if (likely(selector == 0)) {
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231 } else {
232
233
234
235
236
237
238
239
240
241
242 if (which == FS)
243 prev_p->thread.fsbase = 0;
244 else
245 prev_p->thread.gsbase = 0;
246 }
247}
248
249static __always_inline void save_fsgs(struct task_struct *task)
250{
251 savesegment(fs, task->thread.fsindex);
252 savesegment(gs, task->thread.gsindex);
253 if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
254
255
256
257
258
259 task->thread.fsbase = rdfsbase();
260 task->thread.gsbase = __rdgsbase_inactive();
261 } else {
262 save_base_legacy(task, task->thread.fsindex, FS);
263 save_base_legacy(task, task->thread.gsindex, GS);
264 }
265}
266
267
268
269
270
271void current_save_fsgs(void)
272{
273 unsigned long flags;
274
275
276 local_irq_save(flags);
277 save_fsgs(current);
278 local_irq_restore(flags);
279}
280#if IS_ENABLED(CONFIG_KVM)
281EXPORT_SYMBOL_GPL(current_save_fsgs);
282#endif
283
284static __always_inline void loadseg(enum which_selector which,
285 unsigned short sel)
286{
287 if (which == FS)
288 loadsegment(fs, sel);
289 else
290 load_gs_index(sel);
291}
292
293static __always_inline void load_seg_legacy(unsigned short prev_index,
294 unsigned long prev_base,
295 unsigned short next_index,
296 unsigned long next_base,
297 enum which_selector which)
298{
299 if (likely(next_index <= 3)) {
300
301
302
303
304 if (next_base == 0) {
305
306
307
308
309 if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
310 loadseg(which, __USER_DS);
311 loadseg(which, next_index);
312 } else {
313
314
315
316
317
318
319
320
321
322
323
324
325 if (likely(prev_index | next_index | prev_base))
326 loadseg(which, next_index);
327 }
328 } else {
329 if (prev_index != next_index)
330 loadseg(which, next_index);
331 wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
332 next_base);
333 }
334 } else {
335
336
337
338
339 loadseg(which, next_index);
340 }
341}
342
343
344
345
346
347
348
349static __always_inline void x86_pkru_load(struct thread_struct *prev,
350 struct thread_struct *next)
351{
352 if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
353 return;
354
355
356 prev->pkru = rdpkru();
357
358
359
360
361
362 if (prev->pkru != next->pkru)
363 wrpkru(next->pkru);
364}
365
366static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
367 struct thread_struct *next)
368{
369 if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
370
371 if (unlikely(prev->fsindex || next->fsindex))
372 loadseg(FS, next->fsindex);
373 if (unlikely(prev->gsindex || next->gsindex))
374 loadseg(GS, next->gsindex);
375
376
377 wrfsbase(next->fsbase);
378 __wrgsbase_inactive(next->gsbase);
379 } else {
380 load_seg_legacy(prev->fsindex, prev->fsbase,
381 next->fsindex, next->fsbase, FS);
382 load_seg_legacy(prev->gsindex, prev->gsbase,
383 next->gsindex, next->gsbase, GS);
384 }
385}
386
387unsigned long x86_fsgsbase_read_task(struct task_struct *task,
388 unsigned short selector)
389{
390 unsigned short idx = selector >> 3;
391 unsigned long base;
392
393 if (likely((selector & SEGMENT_TI_MASK) == 0)) {
394 if (unlikely(idx >= GDT_ENTRIES))
395 return 0;
396
397
398
399
400
401 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
402 return 0;
403
404 idx -= GDT_ENTRY_TLS_MIN;
405 base = get_desc_base(&task->thread.tls_array[idx]);
406 } else {
407#ifdef CONFIG_MODIFY_LDT_SYSCALL
408 struct ldt_struct *ldt;
409
410
411
412
413
414
415 mutex_lock(&task->mm->context.lock);
416 ldt = task->mm->context.ldt;
417 if (unlikely(!ldt || idx >= ldt->nr_entries))
418 base = 0;
419 else
420 base = get_desc_base(ldt->entries + idx);
421 mutex_unlock(&task->mm->context.lock);
422#else
423 base = 0;
424#endif
425 }
426
427 return base;
428}
429
430unsigned long x86_gsbase_read_cpu_inactive(void)
431{
432 unsigned long gsbase;
433
434 if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
435 unsigned long flags;
436
437 local_irq_save(flags);
438 gsbase = __rdgsbase_inactive();
439 local_irq_restore(flags);
440 } else {
441 rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
442 }
443
444 return gsbase;
445}
446
447void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
448{
449 if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
450 unsigned long flags;
451
452 local_irq_save(flags);
453 __wrgsbase_inactive(gsbase);
454 local_irq_restore(flags);
455 } else {
456 wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
457 }
458}
459
460unsigned long x86_fsbase_read_task(struct task_struct *task)
461{
462 unsigned long fsbase;
463
464 if (task == current)
465 fsbase = x86_fsbase_read_cpu();
466 else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
467 (task->thread.fsindex == 0))
468 fsbase = task->thread.fsbase;
469 else
470 fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
471
472 return fsbase;
473}
474
475unsigned long x86_gsbase_read_task(struct task_struct *task)
476{
477 unsigned long gsbase;
478
479 if (task == current)
480 gsbase = x86_gsbase_read_cpu_inactive();
481 else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
482 (task->thread.gsindex == 0))
483 gsbase = task->thread.gsbase;
484 else
485 gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
486
487 return gsbase;
488}
489
490void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
491{
492 WARN_ON_ONCE(task == current);
493
494 task->thread.fsbase = fsbase;
495}
496
497void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
498{
499 WARN_ON_ONCE(task == current);
500
501 task->thread.gsbase = gsbase;
502}
503
/*
 * Common helper behind start_thread() and compat_start_thread(): reset the
 * data segment registers of the current CPU and set up @regs so that the
 * task resumes at @new_ip with stack pointer @new_sp.
 *
 * @regs:   register frame to fill in; expected to be current_pt_regs()
 *          (a one-time warning is emitted otherwise)
 * @new_ip: instruction pointer to store in the frame
 * @new_sp: stack pointer to store in the frame
 * @_cs:    code segment selector to store in the frame
 * @_ss:    stack segment selector to store in the frame
 * @_ds:    selector loaded into both DS and ES
 */
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/*
		 * On CPUs flagged with X86_BUG_NULL_SEG, load __USER_DS into
		 * FS and GS before the null selectors are loaded below.
		 * NOTE(review): presumably because a null selector load alone
		 * does not reset the segment base on these CPUs - confirm.
		 */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	/* FS and GS get the null selector; ES and DS get the caller's _ds. */
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	/* Fill in the frame; the flags are set to exactly X86_EFLAGS_IF. */
	regs->ip = new_ip;
	regs->sp = new_sp;
	regs->cs = _cs;
	regs->ss = _ss;
	regs->flags = X86_EFLAGS_IF;
}
528
529void
530start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
531{
532 start_thread_common(regs, new_ip, new_sp,
533 __USER_CS, __USER_DS, 0);
534}
535EXPORT_SYMBOL_GPL(start_thread);
536
#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of start_thread().
 *
 * @x32 selects the code segment: __USER_CS when true, __USER32_CS when
 * false. SS, DS and ES all use __USER_DS.
 */
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
	unsigned int code_sel = __USER32_CS;

	if (x32)
		code_sel = __USER_CS;

	start_thread_common(regs, new_ip, new_sp,
			    code_sel, __USER_DS, __USER_DS);
}
#endif
545
546
547
548
549
550
551
552
553
554
555
/*
 * __switch_to() - switch the per-task CPU state from @prev_p to @next_p.
 *
 * Handles FPU hand-over, FS/GS selectors and bases, TLS, the ES and DS
 * selectors, PKRU, the per-CPU "current task" and top-of-stack values, and
 * a few extras (switch_to_extra(), an SS fix-up for CPUs with
 * X86_BUG_SYSRET_SS_ATTRS, resctrl). The statement order below matters;
 * do not reorder without understanding each step.
 *
 * Returns @prev_p.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	struct fpu *prev_fpu = &prev->fpu;
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();

	/* With CONFIG_DEBUG_ENTRY, warn once if the hardirq stack is marked in use. */
	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(hardirq_stack_inuse));

	/* The FPU prepare step is skipped when TIF_NEED_FPU_LOAD is set. */
	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_prepare(prev_fpu, cpu);

	/*
	 * Save prev's FS/GS selectors and bases now, before load_TLS() and
	 * x86_fsgsbase_load() below change the descriptors and registers.
	 */
	save_fsgs(prev_p);

	/* Load next's TLS state for this CPU. */
	load_TLS(next, cpu);

	/*
	 * NOTE(review): arch_end_context_switch() is not defined in this file;
	 * presumably a paravirt hook that must run after load_TLS() and before
	 * the segment loads below - confirm.
	 */
	arch_end_context_switch(next_p);

	/*
	 * ES and DS: save the outgoing selector, and reload only when either
	 * the old or the new selector is nonzero.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* Load next's FS/GS selectors and bases. */
	x86_fsgsbase_load(prev, next);

	/* Stash prev's PKRU value and load next's if it differs. */
	x86_pkru_load(prev, next);

	/* Publish the new current task and its top of stack for this CPU. */
	this_cpu_write(current_task, next_p);
	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

	switch_fpu_finish(next_fpu);

	/* Update the stack information for next_p. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * On CPUs with the SYSRET SS attributes bug, make sure SS holds
		 * __KERNEL_DS: read the selector and reload it only when it is
		 * something else.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* NOTE(review): presumably applies next_p's resctrl settings on this CPU - confirm. */
	resctrl_sched_in();

	return prev_p;
}
663
664void set_personality_64bit(void)
665{
666
667
668
669 clear_thread_flag(TIF_ADDR32);
670
671 task_pt_regs(current)->orig_ax = __NR_execve;
672 current_thread_info()->status &= ~TS_COMPAT;
673 if (current->mm)
674 current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL;
675
676
677
678
679
680 current->personality &= ~READ_IMPLIES_EXEC;
681}
682
/*
 * x32 half of set_personality_ia32(); a no-op unless CONFIG_X86_X32 is set.
 *
 * Clears the mm context flags (if there is an mm), clears
 * READ_IMPLIES_EXEC, sets orig_ax to the x32 execve number with
 * __X32_SYSCALL_BIT set, and clears TS_COMPAT.
 */
static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
	struct mm_struct *mm = current->mm;
	struct pt_regs *regs;

	if (mm)
		mm->context.flags = 0;

	current->personality &= ~READ_IMPLIES_EXEC;

	regs = task_pt_regs(current);
	regs->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}
702
/*
 * ia32 half of set_personality_ia32(); a no-op unless
 * CONFIG_IA32_EMULATION is set.
 *
 * Sets the mm context flags to MM_CONTEXT_UPROBE_IA32 (if there is an mm),
 * ORs force_personality32 into the personality, sets orig_ax to
 * __NR_ia32_execve and sets TS_COMPAT.
 */
static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	struct mm_struct *mm = current->mm;
	struct pt_regs *regs;

	if (mm)
		mm->context.flags = MM_CONTEXT_UPROBE_IA32;

	current->personality |= force_personality32;

	regs = task_pt_regs(current);
	regs->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}
720
721void set_personality_ia32(bool x32)
722{
723
724 set_thread_flag(TIF_ADDR32);
725
726 if (x32)
727 __set_personality_x32();
728 else
729 __set_personality_ia32();
730}
731EXPORT_SYMBOL_GPL(set_personality_ia32);
732
#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Map the vDSO @image at @addr through map_vdso_once().
 *
 * Returns the nonzero result of map_vdso_once() if it reports one,
 * otherwise the size of the image.
 */
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int err = map_vdso_once(image, addr);

	return err ? err : (long)image->size;
}
#endif
745
746long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
747{
748 int ret = 0;
749
750 switch (option) {
751 case ARCH_SET_GS: {
752 if (unlikely(arg2 >= TASK_SIZE_MAX))
753 return -EPERM;
754
755 preempt_disable();
756
757
758
759
760
761
762 if (task == current) {
763 loadseg(GS, 0);
764 x86_gsbase_write_cpu_inactive(arg2);
765
766
767
768
769
770 task->thread.gsbase = arg2;
771
772 } else {
773 task->thread.gsindex = 0;
774 x86_gsbase_write_task(task, arg2);
775 }
776 preempt_enable();
777 break;
778 }
779 case ARCH_SET_FS: {
780
781
782
783
784 if (unlikely(arg2 >= TASK_SIZE_MAX))
785 return -EPERM;
786
787 preempt_disable();
788
789
790
791
792 if (task == current) {
793 loadseg(FS, 0);
794 x86_fsbase_write_cpu(arg2);
795
796
797
798
799
800 task->thread.fsbase = arg2;
801 } else {
802 task->thread.fsindex = 0;
803 x86_fsbase_write_task(task, arg2);
804 }
805 preempt_enable();
806 break;
807 }
808 case ARCH_GET_FS: {
809 unsigned long base = x86_fsbase_read_task(task);
810
811 ret = put_user(base, (unsigned long __user *)arg2);
812 break;
813 }
814 case ARCH_GET_GS: {
815 unsigned long base = x86_gsbase_read_task(task);
816
817 ret = put_user(base, (unsigned long __user *)arg2);
818 break;
819 }
820
821#ifdef CONFIG_CHECKPOINT_RESTORE
822# ifdef CONFIG_X86_X32_ABI
823 case ARCH_MAP_VDSO_X32:
824 return prctl_map_vdso(&vdso_image_x32, arg2);
825# endif
826# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
827 case ARCH_MAP_VDSO_32:
828 return prctl_map_vdso(&vdso_image_32, arg2);
829# endif
830 case ARCH_MAP_VDSO_64:
831 return prctl_map_vdso(&vdso_image_64, arg2);
832#endif
833
834 default:
835 ret = -EINVAL;
836 break;
837 }
838
839 return ret;
840}
841
842SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
843{
844 long ret;
845
846 ret = do_arch_prctl_64(current, option, arg2);
847 if (ret == -EINVAL)
848 ret = do_arch_prctl_common(current, option, arg2);
849
850 return ret;
851}
852
#ifdef CONFIG_IA32_EMULATION
/*
 * Compat arch_prctl() system call: only the common options are handled,
 * the 64-bit specific handler is not consulted.
 */
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	long ret = do_arch_prctl_common(current, option, arg2);

	return ret;
}
#endif
859
860unsigned long KSTK_ESP(struct task_struct *task)
861{
862 return task_pt_regs(task)->sp;
863}
864