linux/arch/x86/kernel/hw_breakpoint.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 * Copyright (C) 2007 Alan Stern
 * Copyright (C) 2009 IBM Corporation
 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
 *
 * Authors: Alan Stern <stern@rowland.harvard.edu>
 *          K.Prasad <prasad@linux.vnet.ibm.com>
 *          Frederic Weisbecker <fweisbec@gmail.com>
 */

/*
 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
 * using the CPU's debug registers.
 */

#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/irqflags.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/smp.h>

#include <asm/hw_breakpoint.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/user.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>

/* Per cpu debug control register value */
DEFINE_PER_CPU(unsigned long, cpu_dr7);
EXPORT_PER_CPU_SYMBOL(cpu_dr7);

/* Per cpu debug address register values */
static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);

/*
 * Stores the breakpoints currently in use on each breakpoint address
 * register for each CPU
 */
static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);


static inline unsigned long
__encode_dr7(int drnum, unsigned int len, unsigned int type)
{
        unsigned long bp_info;

        bp_info = (len | type) & 0xf;
        bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
        bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));

        return bp_info;
}

/*
 * Encode the length, type, Exact, and Enable bits for a particular breakpoint
 * as stored in debug register 7.
 */
unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
{
        return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
}
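
/*
 * Worked example (using the encodings from asm/debugreg.h and
 * asm/hw_breakpoint.h): encode_dr7(0, X86_BREAKPOINT_LEN_4,
 * X86_BREAKPOINT_WRITE) keeps the low nibble of (0x4c | 0x81), i.e. 0xd
 * (LEN0 = 0b11, R/W0 = 0b01), shifts it to DR7[19:16], and ORs in G0
 * (bit 1) and GE (bit 9), giving 0x000d0202.
 */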

/*
 * Decode the length and type bits for a particular breakpoint as
 * stored in debug register 7.  Return the "enabled" status.
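 *
 * The length and type are handed back re-encoded as X86_BREAKPOINT_LEN_*
 * and X86_BREAKPOINT_* values (hence the 0x40 and 0x80 bases below), and
 * the returned status carries the slot's local (bit 0) and global (bit 1)
 * enable bits.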
 */
int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
{
        int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);

        *len = (bp_info & 0xc) | 0x40;
        *type = (bp_info & 0x3) | 0x80;

        return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
}

/*
 * Install a perf counter breakpoint.
 *
 * We seek a free debug address register and use it for this
 * breakpoint. Finally we enable it in the debug control register.
 *
 * Atomic: we hold the counter->ctx->lock and we only handle variables
 * and registers local to this cpu.
 */
int arch_install_hw_breakpoint(struct perf_event *bp)
{
        struct arch_hw_breakpoint *info = counter_arch_bp(bp);
        unsigned long *dr7;
        int i;

        lockdep_assert_irqs_disabled();

        for (i = 0; i < HBP_NUM; i++) {
                struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);

                if (!*slot) {
                        *slot = bp;
                        break;
                }
        }

        if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
                return -EBUSY;

        set_debugreg(info->address, i);
        __this_cpu_write(cpu_debugreg[i], info->address);

        dr7 = this_cpu_ptr(&cpu_dr7);
        *dr7 |= encode_dr7(i, info->len, info->type);

        /*
         * Ensure we first write cpu_dr7 before we set the DR7 register.
         * This ensures an NMI never sees cpu_dr7 as 0 when DR7 is not.
         */
        barrier();

        set_debugreg(*dr7, 7);
        if (info->mask)
                set_dr_addr_mask(info->mask, i);

        return 0;
}

/*
 * Uninstall the breakpoint contained in the given counter.
 *
 * First we search for the debug address register it uses and then we
 * disable it.
 *
 * Atomic: we hold the counter->ctx->lock and we only handle variables
 * and registers local to this cpu.
 */
void arch_uninstall_hw_breakpoint(struct perf_event *bp)
{
        struct arch_hw_breakpoint *info = counter_arch_bp(bp);
        unsigned long dr7;
        int i;

        lockdep_assert_irqs_disabled();

        for (i = 0; i < HBP_NUM; i++) {
                struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);

                if (*slot == bp) {
                        *slot = NULL;
                        break;
                }
        }

        if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
                return;

        dr7 = this_cpu_read(cpu_dr7);
        dr7 &= ~__encode_dr7(i, info->len, info->type);

        set_debugreg(dr7, 7);
        if (info->mask)
                set_dr_addr_mask(0, i);

        /*
         * Ensure the write to cpu_dr7 is after we've set the DR7 register.
         * This ensures an NMI never sees cpu_dr7 as 0 when DR7 is not.
         */
        barrier();

        this_cpu_write(cpu_dr7, dr7);
}

static int arch_bp_generic_len(int x86_len)
{
        switch (x86_len) {
        case X86_BREAKPOINT_LEN_1:
                return HW_BREAKPOINT_LEN_1;
        case X86_BREAKPOINT_LEN_2:
                return HW_BREAKPOINT_LEN_2;
        case X86_BREAKPOINT_LEN_4:
                return HW_BREAKPOINT_LEN_4;
#ifdef CONFIG_X86_64
        case X86_BREAKPOINT_LEN_8:
                return HW_BREAKPOINT_LEN_8;
#endif
        default:
                return -EINVAL;
        }
}

int arch_bp_generic_fields(int x86_len, int x86_type,
                           int *gen_len, int *gen_type)
{
        int len;

        /* Type */
        switch (x86_type) {
        case X86_BREAKPOINT_EXECUTE:
                if (x86_len != X86_BREAKPOINT_LEN_X)
                        return -EINVAL;

                *gen_type = HW_BREAKPOINT_X;
                *gen_len = sizeof(long);
                return 0;
        case X86_BREAKPOINT_WRITE:
                *gen_type = HW_BREAKPOINT_W;
                break;
        case X86_BREAKPOINT_RW:
                *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
                break;
        default:
                return -EINVAL;
        }

        /* Len */
        len = arch_bp_generic_len(x86_len);
        if (len < 0)
                return -EINVAL;
        *gen_len = len;

        return 0;
}

/*
 * Check for virtual address in kernel space.
 */
int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
{
        unsigned long va;
        int len;

        va = hw->address;
        len = arch_bp_generic_len(hw->len);
        WARN_ON_ONCE(len < 0);

        /*
         * We don't need to worry about va + len - 1 overflowing:
         * we already require that va is aligned to a multiple of len.
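         *
         * The breakpoint counts as a kernel-space one if either end of
         * [va, va + len - 1] lies at or above TASK_SIZE_MAX.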
         */
        return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
}

/*
 * Checks whether the range [addr, end] overlaps the area [base, base + size).
 */
static inline bool within_area(unsigned long addr, unsigned long end,
                               unsigned long base, unsigned long size)
{
        return end >= base && addr < (base + size);
}

/*
 * Checks whether the range from addr to end, inclusive, overlaps the fixed
 * mapped CPU entry area range or other ranges used for CPU entry.
 */
static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
{
        int cpu;

        /* CPU entry area is always used for CPU entry */
        if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
                        CPU_ENTRY_AREA_TOTAL_SIZE))
                return true;

        /*
         * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
         * GSBASE value via __per_cpu_offset or pcpu_unit_offsets.
         */
#ifdef CONFIG_SMP
        if (within_area(addr, end, (unsigned long)__per_cpu_offset,
                        sizeof(unsigned long) * nr_cpu_ids))
                return true;
#else
        if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets,
                        sizeof(pcpu_unit_offsets)))
                return true;
#endif

        for_each_possible_cpu(cpu) {
                /* The original rw GDT is being used after load_direct_gdt() */
                if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
                                GDT_SIZE))
                        return true;

                /*
                 * cpu_tss_rw is not directly referenced by hardware, but
                 * it is also used in CPU entry code.
                 */
                if (within_area(addr, end,
                                (unsigned long)&per_cpu(cpu_tss_rw, cpu),
                                sizeof(struct tss_struct)))
                        return true;

                /*
                 * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry.
                 * A data breakpoint on it would cause an unwanted #DB.
                 * Protect the full cpu_tlbstate structure to be sure.
                 */
                if (within_area(addr, end,
                                (unsigned long)&per_cpu(cpu_tlbstate, cpu),
                                sizeof(struct tlb_state)))
                        return true;

                /*
                 * When in a guest (X86_FEATURE_HYPERVISOR), local_db_save()
                 * will read the per-CPU cpu_dr7 before clearing the DR7
                 * register.
                 */
                if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu),
                                sizeof(cpu_dr7)))
                        return true;
        }

        return false;
}

static int arch_build_bp_info(struct perf_event *bp,
                              const struct perf_event_attr *attr,
                              struct arch_hw_breakpoint *hw)
{
        unsigned long bp_end;

        bp_end = attr->bp_addr + attr->bp_len - 1;
        if (bp_end < attr->bp_addr)
                return -EINVAL;

        /*
         * Prevent any breakpoint of any type that overlaps the CPU
         * entry area and data.  This protects the IST stacks and also
         * reduces the chance that we ever find out what happens if
         * there's a data breakpoint on the GDT, IDT, or TSS.
         */
        if (within_cpu_entry(attr->bp_addr, bp_end))
                return -EINVAL;

        hw->address = attr->bp_addr;
        hw->mask = 0;

        /* Type */
        switch (attr->bp_type) {
        case HW_BREAKPOINT_W:
                hw->type = X86_BREAKPOINT_WRITE;
                break;
        case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
                hw->type = X86_BREAKPOINT_RW;
                break;
        case HW_BREAKPOINT_X:
                /*
                 * We don't allow kernel breakpoints in places that are not
                 * acceptable for kprobes.  On non-kprobes kernels, we don't
                 * allow kernel breakpoints at all.
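                 * (On kernels without CONFIG_KPROBES, within_kprobe_blacklist()
                 * is a stub that always returns true, which is what rejects
                 * every kernel address here.)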
                 */
                if (attr->bp_addr >= TASK_SIZE_MAX) {
                        if (within_kprobe_blacklist(attr->bp_addr))
                                return -EINVAL;
                }

                hw->type = X86_BREAKPOINT_EXECUTE;
                /*
                 * x86 instruction breakpoints have no meaningful length; they
                 * must use the special LEN_X encoding.  But we still need to
                 * check that userspace is not trying to set up an unsupported
                 * length, for example to get a range breakpoint.
                 */
                if (attr->bp_len == sizeof(long)) {
                        hw->len = X86_BREAKPOINT_LEN_X;
                        return 0;
                }
                fallthrough;
        default:
                return -EINVAL;
        }

        /* Len */
        switch (attr->bp_len) {
        case HW_BREAKPOINT_LEN_1:
                hw->len = X86_BREAKPOINT_LEN_1;
                break;
        case HW_BREAKPOINT_LEN_2:
                hw->len = X86_BREAKPOINT_LEN_2;
                break;
        case HW_BREAKPOINT_LEN_4:
                hw->len = X86_BREAKPOINT_LEN_4;
                break;
#ifdef CONFIG_X86_64
        case HW_BREAKPOINT_LEN_8:
                hw->len = X86_BREAKPOINT_LEN_8;
                break;
#endif
        default:
                /* AMD range breakpoint */
                if (!is_power_of_2(attr->bp_len))
                        return -EINVAL;
                if (attr->bp_addr & (attr->bp_len - 1))
                        return -EINVAL;

                if (!boot_cpu_has(X86_FEATURE_BPEXT))
                        return -EOPNOTSUPP;

                /*
                 * It's impossible to use a range breakpoint to fake out
                 * user vs kernel detection because bp_len - 1 can't
                 * have the high bit set.  If we ever allow range instruction
                 * breakpoints, then we'll have to check for kprobe-blacklisted
                 * addresses anywhere in the range.
                 */
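                /*
                 * The mask set below is later written to the AMD address-mask
                 * register by set_dr_addr_mask(); masked-off low address bits
                 * are ignored in the breakpoint comparison, so this LEN_1
                 * breakpoint effectively covers bp_len bytes.
                 */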
                hw->mask = attr->bp_len - 1;
                hw->len = X86_BREAKPOINT_LEN_1;
        }

        return 0;
}

/*
 * Validate the arch-specific HW Breakpoint register settings
 */
int hw_breakpoint_arch_parse(struct perf_event *bp,
                             const struct perf_event_attr *attr,
                             struct arch_hw_breakpoint *hw)
{
        unsigned int align;
        int ret;


        ret = arch_build_bp_info(bp, attr, hw);
        if (ret)
                return ret;

        switch (hw->len) {
        case X86_BREAKPOINT_LEN_1:
                align = 0;
                if (hw->mask)
                        align = hw->mask;
                break;
        case X86_BREAKPOINT_LEN_2:
                align = 1;
                break;
        case X86_BREAKPOINT_LEN_4:
                align = 3;
                break;
#ifdef CONFIG_X86_64
        case X86_BREAKPOINT_LEN_8:
                align = 7;
                break;
#endif
        default:
                WARN_ON_ONCE(1);
                return -EINVAL;
        }

        /*
         * Check that the low-order bits of the address are appropriate
         * for the alignment implied by len.
         */
        if (hw->address & align)
                return -EINVAL;

        return 0;
}

/*
 * Release the user breakpoints used by ptrace
 */
void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
{
        int i;
        struct thread_struct *t = &tsk->thread;

        for (i = 0; i < HBP_NUM; i++) {
                unregister_hw_breakpoint(t->ptrace_bps[i]);
                t->ptrace_bps[i] = NULL;
        }

        t->virtual_dr6 = 0;
        t->ptrace_dr7 = 0;
}

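/*
 * Reprogram the hardware debug registers from the per-CPU cached values
 * (cpu_debugreg[] and cpu_dr7) after they have been clobbered, e.g. after
 * a KVM guest has run with its own debug registers; exported for modular
 * users such as KVM.
 */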
void hw_breakpoint_restore(void)
{
        set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
        set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
        set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
        set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
        set_debugreg(DR6_RESERVED, 6);
        set_debugreg(__this_cpu_read(cpu_dr7), 7);
}
EXPORT_SYMBOL_GPL(hw_breakpoint_restore);

/*
 * Handle debug exception notifications.
 *
 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
 *
 * NOTIFY_DONE is returned if one of the following conditions is true:
 * i) the causative address is from user-space and the exception is a valid
 * one, i.e. not triggered as a result of lazy debug register switching;
 * ii) bits other than trap<n> are set in the DR6 register (such as BD, BS
 * or BT), indicating that more than one debug condition is met and requires
 * further action in do_debug().
 *
 * NOTIFY_STOP is returned for all other cases.
 */
static int hw_breakpoint_handler(struct die_args *args)
{
        int i, rc = NOTIFY_STOP;
        struct perf_event *bp;
        unsigned long *dr6_p;
        unsigned long dr6;
        bool bpx;

        /* The DR6 value is pointed to by args->err */
        dr6_p = (unsigned long *)ERR_PTR(args->err);
        dr6 = *dr6_p;

        /* Do an early return if no trap bits are set in DR6 */
        if ((dr6 & DR_TRAP_BITS) == 0)
                return NOTIFY_DONE;

        /* Handle all the breakpoints that were triggered */
        for (i = 0; i < HBP_NUM; ++i) {
                if (likely(!(dr6 & (DR_TRAP0 << i))))
                        continue;

                bp = this_cpu_read(bp_per_reg[i]);
                if (!bp)
                        continue;

                bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE;

                /*
                 * TF and data breakpoints are traps and can be merged; however,
                 * instruction breakpoints are faults and will be raised
                 * separately.
                 *
                 * However, DR6 can indicate both TF and instruction
                 * breakpoints. In that case take TF as that has precedence and
                 * delay the instruction breakpoint for the next exception.
                 */
                if (bpx && (dr6 & DR_STEP))
                        continue;

                /*
                 * Reset the 'i'th TRAP bit in dr6 to denote completion of
                 * exception handling
                 */
                (*dr6_p) &= ~(DR_TRAP0 << i);

                perf_bp_event(bp, args->regs);

                /*
                 * Set the resume flag to avoid breakpoint recursion when
                 * returning to the origin.
                 */
                if (bpx)
                        args->regs->flags |= X86_EFLAGS_RF;
        }

        /*
         * Further processing in do_debug() is needed for a) user-space
         * breakpoints (to generate signals) and b) when the system has
         * taken an exception due to multiple causes.
         */
        if ((current->thread.virtual_dr6 & DR_TRAP_BITS) ||
            (dr6 & (~DR_TRAP_BITS)))
                rc = NOTIFY_DONE;

        return rc;
}

/*
 * Handle debug exception notifications.
 */
int hw_breakpoint_exceptions_notify(
                struct notifier_block *unused, unsigned long val, void *data)
{
        if (val != DIE_DEBUG)
                return NOTIFY_DONE;

        return hw_breakpoint_handler(data);
}

void hw_breakpoint_pmu_read(struct perf_event *bp)
{
        /* TODO */
}