linux/security/commoncap.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/* Common capabilities, needed by capability.o.
   3 */
   4
   5#include <linux/capability.h>
   6#include <linux/audit.h>
   7#include <linux/init.h>
   8#include <linux/kernel.h>
   9#include <linux/lsm_hooks.h>
  10#include <linux/file.h>
  11#include <linux/mm.h>
  12#include <linux/mman.h>
  13#include <linux/pagemap.h>
  14#include <linux/swap.h>
  15#include <linux/skbuff.h>
  16#include <linux/netlink.h>
  17#include <linux/ptrace.h>
  18#include <linux/xattr.h>
  19#include <linux/hugetlb.h>
  20#include <linux/mount.h>
  21#include <linux/sched.h>
  22#include <linux/prctl.h>
  23#include <linux/securebits.h>
  24#include <linux/user_namespace.h>
  25#include <linux/binfmts.h>
  26#include <linux/personality.h>
  27#include <linux/mnt_idmapping.h>
  28
  29/*
  30 * If a non-root user executes a setuid-root binary in
  31 * !secure(SECURE_NOROOT) mode, then we raise capabilities.
  32 * However if fE is also set, then the intent is for only
  33 * the file capabilities to be applied, and the setuid-root
  34 * bit is left on either to change the uid (plausible) or
  35 * to get full privilege on a kernel without file capabilities
  36 * support.  So in that case we do not raise capabilities.
  37 *
  38 * Warn if that happens, once per boot.
  39 */
  40static void warn_setuid_and_fcaps_mixed(const char *fname)
  41{
  42        static int warned;
  43        if (!warned) {
  44                printk(KERN_INFO "warning: `%s' has both setuid-root and"
  45                        " effective capabilities. Therefore not raising all"
  46                        " capabilities.\n", fname);
  47                warned = 1;
  48        }
  49}
  50
  51/**
  52 * cap_capable - Determine whether a task has a particular effective capability
  53 * @cred: The credentials to use
  54 * @targ_ns:  The user namespace in which we need the capability
  55 * @cap: The capability to check for
  56 * @opts: Bitmask of options defined in include/linux/security.h
  57 *
  58 * Determine whether the nominated task has the specified capability amongst
  59 * its effective set, returning 0 if it does, -ve if it does not.
  60 *
  61 * NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
  62 * and has_capability() functions.  That is, it has the reverse semantics:
  63 * cap_has_capability() returns 0 when a task has a capability, but the
  64 * kernel's capable() and has_capability() returns 1 for this case.
  65 */
  66int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
  67                int cap, unsigned int opts)
  68{
  69        struct user_namespace *ns = targ_ns;
  70
  71        /* See if cred has the capability in the target user namespace
  72         * by examining the target user namespace and all of the target
  73         * user namespace's parents.
  74         */
  75        for (;;) {
  76                /* Do we have the necessary capabilities? */
  77                if (ns == cred->user_ns)
  78                        return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
  79
  80                /*
  81                 * If we're already at a lower level than we're looking for,
  82                 * we're done searching.
  83                 */
  84                if (ns->level <= cred->user_ns->level)
  85                        return -EPERM;
  86
  87                /* 
  88                 * The owner of the user namespace in the parent of the
  89                 * user namespace has all caps.
  90                 */
  91                if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
  92                        return 0;
  93
  94                /*
  95                 * If you have a capability in a parent user ns, then you have
  96                 * it over all children user namespaces as well.
  97                 */
  98                ns = ns->parent;
  99        }
 100
 101        /* We never get here */
 102}
 103
 104/**
 105 * cap_settime - Determine whether the current process may set the system clock
 106 * @ts: The time to set
 107 * @tz: The timezone to set
 108 *
 109 * Determine whether the current process may set the system clock and timezone
 110 * information, returning 0 if permission granted, -ve if denied.
 111 */
 112int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
 113{
 114        if (!capable(CAP_SYS_TIME))
 115                return -EPERM;
 116        return 0;
 117}
 118
 119/**
 120 * cap_ptrace_access_check - Determine whether the current process may access
 121 *                         another
 122 * @child: The process to be accessed
 123 * @mode: The mode of attachment.
 124 *
 125 * If we are in the same or an ancestor user_ns and have all the target
 126 * task's capabilities, then ptrace access is allowed.
 127 * If we have the ptrace capability to the target user_ns, then ptrace
 128 * access is allowed.
 129 * Else denied.
 130 *
 131 * Determine whether a process may access another, returning 0 if permission
 132 * granted, -ve if denied.
 133 */
 134int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
 135{
 136        int ret = 0;
 137        const struct cred *cred, *child_cred;
 138        const kernel_cap_t *caller_caps;
 139
 140        rcu_read_lock();
 141        cred = current_cred();
 142        child_cred = __task_cred(child);
 143        if (mode & PTRACE_MODE_FSCREDS)
 144                caller_caps = &cred->cap_effective;
 145        else
 146                caller_caps = &cred->cap_permitted;
 147        if (cred->user_ns == child_cred->user_ns &&
 148            cap_issubset(child_cred->cap_permitted, *caller_caps))
 149                goto out;
 150        if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
 151                goto out;
 152        ret = -EPERM;
 153out:
 154        rcu_read_unlock();
 155        return ret;
 156}
 157
 158/**
 159 * cap_ptrace_traceme - Determine whether another process may trace the current
 160 * @parent: The task proposed to be the tracer
 161 *
 162 * If parent is in the same or an ancestor user_ns and has all current's
 163 * capabilities, then ptrace access is allowed.
 164 * If parent has the ptrace capability to current's user_ns, then ptrace
 165 * access is allowed.
 166 * Else denied.
 167 *
 168 * Determine whether the nominated task is permitted to trace the current
 169 * process, returning 0 if permission is granted, -ve if denied.
 170 */
 171int cap_ptrace_traceme(struct task_struct *parent)
 172{
 173        int ret = 0;
 174        const struct cred *cred, *child_cred;
 175
 176        rcu_read_lock();
 177        cred = __task_cred(parent);
 178        child_cred = current_cred();
 179        if (cred->user_ns == child_cred->user_ns &&
 180            cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
 181                goto out;
 182        if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE))
 183                goto out;
 184        ret = -EPERM;
 185out:
 186        rcu_read_unlock();
 187        return ret;
 188}
 189
 190/**
 191 * cap_capget - Retrieve a task's capability sets
 192 * @target: The task from which to retrieve the capability sets
 193 * @effective: The place to record the effective set
 194 * @inheritable: The place to record the inheritable set
 195 * @permitted: The place to record the permitted set
 196 *
 197 * This function retrieves the capabilities of the nominated task and returns
 198 * them to the caller.
 199 */
 200int cap_capget(const struct task_struct *target, kernel_cap_t *effective,
 201               kernel_cap_t *inheritable, kernel_cap_t *permitted)
 202{
 203        const struct cred *cred;
 204
 205        /* Derived from kernel/capability.c:sys_capget. */
 206        rcu_read_lock();
 207        cred = __task_cred(target);
 208        *effective   = cred->cap_effective;
 209        *inheritable = cred->cap_inheritable;
 210        *permitted   = cred->cap_permitted;
 211        rcu_read_unlock();
 212        return 0;
 213}
 214
 215/*
 216 * Determine whether the inheritable capabilities are limited to the old
 217 * permitted set.  Returns 1 if they are limited, 0 if they are not.
 218 */
 219static inline int cap_inh_is_capped(void)
 220{
 221        /* they are so limited unless the current task has the CAP_SETPCAP
 222         * capability
 223         */
 224        if (cap_capable(current_cred(), current_cred()->user_ns,
 225                        CAP_SETPCAP, CAP_OPT_NONE) == 0)
 226                return 0;
 227        return 1;
 228}
 229
 230/**
 231 * cap_capset - Validate and apply proposed changes to current's capabilities
 232 * @new: The proposed new credentials; alterations should be made here
 233 * @old: The current task's current credentials
 234 * @effective: A pointer to the proposed new effective capabilities set
 235 * @inheritable: A pointer to the proposed new inheritable capabilities set
 236 * @permitted: A pointer to the proposed new permitted capabilities set
 237 *
 238 * This function validates and applies a proposed mass change to the current
 239 * process's capability sets.  The changes are made to the proposed new
 240 * credentials, and assuming no error, will be committed by the caller of LSM.
 241 */
 242int cap_capset(struct cred *new,
 243               const struct cred *old,
 244               const kernel_cap_t *effective,
 245               const kernel_cap_t *inheritable,
 246               const kernel_cap_t *permitted)
 247{
 248        if (cap_inh_is_capped() &&
 249            !cap_issubset(*inheritable,
 250                          cap_combine(old->cap_inheritable,
 251                                      old->cap_permitted)))
 252                /* incapable of using this inheritable set */
 253                return -EPERM;
 254
 255        if (!cap_issubset(*inheritable,
 256                          cap_combine(old->cap_inheritable,
 257                                      old->cap_bset)))
 258                /* no new pI capabilities outside bounding set */
 259                return -EPERM;
 260
 261        /* verify restrictions on target's new Permitted set */
 262        if (!cap_issubset(*permitted, old->cap_permitted))
 263                return -EPERM;
 264
 265        /* verify the _new_Effective_ is a subset of the _new_Permitted_ */
 266        if (!cap_issubset(*effective, *permitted))
 267                return -EPERM;
 268
 269        new->cap_effective   = *effective;
 270        new->cap_inheritable = *inheritable;
 271        new->cap_permitted   = *permitted;
 272
 273        /*
 274         * Mask off ambient bits that are no longer both permitted and
 275         * inheritable.
 276         */
 277        new->cap_ambient = cap_intersect(new->cap_ambient,
 278                                         cap_intersect(*permitted,
 279                                                       *inheritable));
 280        if (WARN_ON(!cap_ambient_invariant_ok(new)))
 281                return -EINVAL;
 282        return 0;
 283}
 284
 285/**
 286 * cap_inode_need_killpriv - Determine if inode change affects privileges
 287 * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV
 288 *
 289 * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV
 290 * affects the security markings on that inode, and if it is, should
 291 * inode_killpriv() be invoked or the change rejected.
 292 *
 293 * Return: 1 if security.capability has a value, meaning inode_killpriv()
 294 * is required, 0 otherwise, meaning inode_killpriv() is not required.
 295 */
 296int cap_inode_need_killpriv(struct dentry *dentry)
 297{
 298        struct inode *inode = d_backing_inode(dentry);
 299        int error;
 300
 301        error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0);
 302        return error > 0;
 303}
 304
 305/**
 306 * cap_inode_killpriv - Erase the security markings on an inode
 307 *
 308 * @idmap:      idmap of the mount the inode was found from
 309 * @dentry:     The inode/dentry to alter
 310 *
 311 * Erase the privilege-enhancing security markings on an inode.
 312 *
 313 * If the inode has been found through an idmapped mount the idmap of
 314 * the vfsmount must be passed through @idmap. This function will then
 315 * take care to map the inode according to @idmap before checking
 316 * permissions. On non-idmapped mounts or if permission checking is to be
 317 * performed on the raw inode simply pass @nop_mnt_idmap.
 318 *
 319 * Return: 0 if successful, -ve on error.
 320 */
 321int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry)
 322{
 323        int error;
 324
 325        error = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS);
 326        if (error == -EOPNOTSUPP)
 327                error = 0;
 328        return error;
 329}
 330
 331static bool rootid_owns_currentns(vfsuid_t rootvfsuid)
 332{
 333        struct user_namespace *ns;
 334        kuid_t kroot;
 335
 336        if (!vfsuid_valid(rootvfsuid))
 337                return false;
 338
 339        kroot = vfsuid_into_kuid(rootvfsuid);
 340        for (ns = current_user_ns();; ns = ns->parent) {
 341                if (from_kuid(ns, kroot) == 0)
 342                        return true;
 343                if (ns == &init_user_ns)
 344                        break;
 345        }
 346
 347        return false;
 348}
 349
 350static __u32 sansflags(__u32 m)
 351{
 352        return m & ~VFS_CAP_FLAGS_EFFECTIVE;
 353}
 354
 355static bool is_v2header(int size, const struct vfs_cap_data *cap)
 356{
 357        if (size != XATTR_CAPS_SZ_2)
 358                return false;
 359        return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
 360}
 361
 362static bool is_v3header(int size, const struct vfs_cap_data *cap)
 363{
 364        if (size != XATTR_CAPS_SZ_3)
 365                return false;
 366        return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
 367}
 368
 369/*
 370 * getsecurity: We are called for security.* before any attempt to read the
 371 * xattr from the inode itself.
 372 *
 373 * This gives us a chance to read the on-disk value and convert it.  If we
 374 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
 375 *
 376 * Note we are not called by vfs_getxattr_alloc(), but that is only called
 377 * by the integrity subsystem, which really wants the unconverted values -
 378 * so that's good.
 379 */
 380int cap_inode_getsecurity(struct mnt_idmap *idmap,
 381                          struct inode *inode, const char *name, void **buffer,
 382                          bool alloc)
 383{
 384        int size;
 385        kuid_t kroot;
 386        vfsuid_t vfsroot;
 387        u32 nsmagic, magic;
 388        uid_t root, mappedroot;
 389        char *tmpbuf = NULL;
 390        struct vfs_cap_data *cap;
 391        struct vfs_ns_cap_data *nscap = NULL;
 392        struct dentry *dentry;
 393        struct user_namespace *fs_ns;
 394
 395        if (strcmp(name, "capability") != 0)
 396                return -EOPNOTSUPP;
 397
 398        dentry = d_find_any_alias(inode);
 399        if (!dentry)
 400                return -EINVAL;
 401        size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, &tmpbuf,
 402                                  sizeof(struct vfs_ns_cap_data), GFP_NOFS);
 403        dput(dentry);
 404        /* gcc11 complains if we don't check for !tmpbuf */
 405        if (size < 0 || !tmpbuf)
 406                goto out_free;
 407
 408        fs_ns = inode->i_sb->s_user_ns;
 409        cap = (struct vfs_cap_data *) tmpbuf;
 410        if (is_v2header(size, cap)) {
 411                root = 0;
 412        } else if (is_v3header(size, cap)) {
 413                nscap = (struct vfs_ns_cap_data *) tmpbuf;
 414                root = le32_to_cpu(nscap->rootid);
 415        } else {
 416                size = -EINVAL;
 417                goto out_free;
 418        }
 419
 420        kroot = make_kuid(fs_ns, root);
 421
 422        /* If this is an idmapped mount shift the kuid. */
 423        vfsroot = make_vfsuid(idmap, fs_ns, kroot);
 424
 425        /* If the root kuid maps to a valid uid in current ns, then return
 426         * this as a nscap. */
 427        mappedroot = from_kuid(current_user_ns(), vfsuid_into_kuid(vfsroot));
 428        if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
 429                size = sizeof(struct vfs_ns_cap_data);
 430                if (alloc) {
 431                        if (!nscap) {
 432                                /* v2 -> v3 conversion */
 433                                nscap = kzalloc(size, GFP_ATOMIC);
 434                                if (!nscap) {
 435                                        size = -ENOMEM;
 436                                        goto out_free;
 437                                }
 438                                nsmagic = VFS_CAP_REVISION_3;
 439                                magic = le32_to_cpu(cap->magic_etc);
 440                                if (magic & VFS_CAP_FLAGS_EFFECTIVE)
 441                                        nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
 442                                memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
 443                                nscap->magic_etc = cpu_to_le32(nsmagic);
 444                        } else {
 445                                /* use allocated v3 buffer */
 446                                tmpbuf = NULL;
 447                        }
 448                        nscap->rootid = cpu_to_le32(mappedroot);
 449                        *buffer = nscap;
 450                }
 451                goto out_free;
 452        }
 453
 454        if (!rootid_owns_currentns(vfsroot)) {
 455                size = -EOVERFLOW;
 456                goto out_free;
 457        }
 458
 459        /* This comes from a parent namespace.  Return as a v2 capability */
 460        size = sizeof(struct vfs_cap_data);
 461        if (alloc) {
 462                if (nscap) {
 463                        /* v3 -> v2 conversion */
 464                        cap = kzalloc(size, GFP_ATOMIC);
 465                        if (!cap) {
 466                                size = -ENOMEM;
 467                                goto out_free;
 468                        }
 469                        magic = VFS_CAP_REVISION_2;
 470                        nsmagic = le32_to_cpu(nscap->magic_etc);
 471                        if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
 472                                magic |= VFS_CAP_FLAGS_EFFECTIVE;
 473                        memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
 474                        cap->magic_etc = cpu_to_le32(magic);
 475                } else {
 476                        /* use unconverted v2 */
 477                        tmpbuf = NULL;
 478                }
 479                *buffer = cap;
 480        }
 481out_free:
 482        kfree(tmpbuf);
 483        return size;
 484}
 485
 486/**
 487 * rootid_from_xattr - translate root uid of vfs caps
 488 *
 489 * @value:      vfs caps value which may be modified by this function
 490 * @size:       size of @ivalue
 491 * @task_ns:    user namespace of the caller
 492 */
 493static vfsuid_t rootid_from_xattr(const void *value, size_t size,
 494                                  struct user_namespace *task_ns)
 495{
 496        const struct vfs_ns_cap_data *nscap = value;
 497        uid_t rootid = 0;
 498
 499        if (size == XATTR_CAPS_SZ_3)
 500                rootid = le32_to_cpu(nscap->rootid);
 501
 502        return VFSUIDT_INIT(make_kuid(task_ns, rootid));
 503}
 504
 505static bool validheader(size_t size, const struct vfs_cap_data *cap)
 506{
 507        return is_v2header(size, cap) || is_v3header(size, cap);
 508}
 509
 510/**
 511 * cap_convert_nscap - check vfs caps
 512 *
 513 * @idmap:      idmap of the mount the inode was found from
 514 * @dentry:     used to retrieve inode to check permissions on
 515 * @ivalue:     vfs caps value which may be modified by this function
 516 * @size:       size of @ivalue
 517 *
 518 * User requested a write of security.capability.  If needed, update the
 519 * xattr to change from v2 to v3, or to fixup the v3 rootid.
 520 *
 521 * If the inode has been found through an idmapped mount the idmap of
 522 * the vfsmount must be passed through @idmap. This function will then
 523 * take care to map the inode according to @idmap before checking
 524 * permissions. On non-idmapped mounts or if permission checking is to be
 525 * performed on the raw inode simply pass @nop_mnt_idmap.
 526 *
 527 * Return: On success, return the new size; on error, return < 0.
 528 */
 529int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry,
 530                      const void **ivalue, size_t size)
 531{
 532        struct vfs_ns_cap_data *nscap;
 533        uid_t nsrootid;
 534        const struct vfs_cap_data *cap = *ivalue;
 535        __u32 magic, nsmagic;
 536        struct inode *inode = d_backing_inode(dentry);
 537        struct user_namespace *task_ns = current_user_ns(),
 538                *fs_ns = inode->i_sb->s_user_ns;
 539        kuid_t rootid;
 540        vfsuid_t vfsrootid;
 541        size_t newsize;
 542
 543        if (!*ivalue)
 544                return -EINVAL;
 545        if (!validheader(size, cap))
 546                return -EINVAL;
 547        if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
 548                return -EPERM;
 549        if (size == XATTR_CAPS_SZ_2 && (idmap == &nop_mnt_idmap))
 550                if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
 551                        /* user is privileged, just write the v2 */
 552                        return size;
 553
 554        vfsrootid = rootid_from_xattr(*ivalue, size, task_ns);
 555        if (!vfsuid_valid(vfsrootid))
 556                return -EINVAL;
 557
 558        rootid = from_vfsuid(idmap, fs_ns, vfsrootid);
 559        if (!uid_valid(rootid))
 560                return -EINVAL;
 561
 562        nsrootid = from_kuid(fs_ns, rootid);
 563        if (nsrootid == -1)
 564                return -EINVAL;
 565
 566        newsize = sizeof(struct vfs_ns_cap_data);
 567        nscap = kmalloc(newsize, GFP_ATOMIC);
 568        if (!nscap)
 569                return -ENOMEM;
 570        nscap->rootid = cpu_to_le32(nsrootid);
 571        nsmagic = VFS_CAP_REVISION_3;
 572        magic = le32_to_cpu(cap->magic_etc);
 573        if (magic & VFS_CAP_FLAGS_EFFECTIVE)
 574                nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
 575        nscap->magic_etc = cpu_to_le32(nsmagic);
 576        memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
 577
 578        *ivalue = nscap;
 579        return newsize;
 580}
 581
 582/*
 583 * Calculate the new process capability sets from the capability sets attached
 584 * to a file.
 585 */
 586static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
 587                                          struct linux_binprm *bprm,
 588                                          bool *effective,
 589                                          bool *has_fcap)
 590{
 591        struct cred *new = bprm->cred;
 592        int ret = 0;
 593
 594        if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
 595                *effective = true;
 596
 597        if (caps->magic_etc & VFS_CAP_REVISION_MASK)
 598                *has_fcap = true;
 599
 600        /*
 601         * pP' = (X & fP) | (pI & fI)
 602         * The addition of pA' is handled later.
 603         */
 604        new->cap_permitted.val =
 605                (new->cap_bset.val & caps->permitted.val) |
 606                (new->cap_inheritable.val & caps->inheritable.val);
 607
 608        if (caps->permitted.val & ~new->cap_permitted.val)
 609                /* insufficient to execute correctly */
 610                ret = -EPERM;
 611
 612        /*
 613         * For legacy apps, with no internal support for recognizing they
 614         * do not have enough capabilities, we return an error if they are
 615         * missing some "forced" (aka file-permitted) capabilities.
 616         */
 617        return *effective ? ret : 0;
 618}
 619
 620/**
 621 * get_vfs_caps_from_disk - retrieve vfs caps from disk
 622 *
 623 * @idmap:      idmap of the mount the inode was found from
 624 * @dentry:     dentry from which @inode is retrieved
 625 * @cpu_caps:   vfs capabilities
 626 *
 627 * Extract the on-exec-apply capability sets for an executable file.
 628 *
 629 * If the inode has been found through an idmapped mount the idmap of
 630 * the vfsmount must be passed through @idmap. This function will then
 631 * take care to map the inode according to @idmap before checking
 632 * permissions. On non-idmapped mounts or if permission checking is to be
 633 * performed on the raw inode simply pass @nop_mnt_idmap.
 634 */
 635int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
 636                           const struct dentry *dentry,
 637                           struct cpu_vfs_cap_data *cpu_caps)
 638{
 639        struct inode *inode = d_backing_inode(dentry);
 640        __u32 magic_etc;
 641        int size;
 642        struct vfs_ns_cap_data data, *nscaps = &data;
 643        struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
 644        kuid_t rootkuid;
 645        vfsuid_t rootvfsuid;
 646        struct user_namespace *fs_ns;
 647
 648        memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
 649
 650        if (!inode)
 651                return -ENODATA;
 652
 653        fs_ns = inode->i_sb->s_user_ns;
 654        size = __vfs_getxattr((struct dentry *)dentry, inode,
 655                              XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
 656        if (size == -ENODATA || size == -EOPNOTSUPP)
 657                /* no data, that's ok */
 658                return -ENODATA;
 659
 660        if (size < 0)
 661                return size;
 662
 663        if (size < sizeof(magic_etc))
 664                return -EINVAL;
 665
 666        cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
 667
 668        rootkuid = make_kuid(fs_ns, 0);
 669        switch (magic_etc & VFS_CAP_REVISION_MASK) {
 670        case VFS_CAP_REVISION_1:
 671                if (size != XATTR_CAPS_SZ_1)
 672                        return -EINVAL;
 673                break;
 674        case VFS_CAP_REVISION_2:
 675                if (size != XATTR_CAPS_SZ_2)
 676                        return -EINVAL;
 677                break;
 678        case VFS_CAP_REVISION_3:
 679                if (size != XATTR_CAPS_SZ_3)
 680                        return -EINVAL;
 681                rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
 682                break;
 683
 684        default:
 685                return -EINVAL;
 686        }
 687
 688        rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid);
 689        if (!vfsuid_valid(rootvfsuid))
 690                return -ENODATA;
 691
 692        /* Limit the caps to the mounter of the filesystem
 693         * or the more limited uid specified in the xattr.
 694         */
 695        if (!rootid_owns_currentns(rootvfsuid))
 696                return -ENODATA;
 697
 698        cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
 699        cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);
 700
 701        /*
 702         * Rev1 had just a single 32-bit word, later expanded
 703         * to a second one for the high bits
 704         */
 705        if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
 706                cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
 707                cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
 708        }
 709
 710        cpu_caps->permitted.val &= CAP_VALID_MASK;
 711        cpu_caps->inheritable.val &= CAP_VALID_MASK;
 712
 713        cpu_caps->rootid = vfsuid_into_kuid(rootvfsuid);
 714
 715        return 0;
 716}
 717
 718/*
 719 * Attempt to get the on-exec apply capability sets for an executable file from
 720 * its xattrs and, if present, apply them to the proposed credentials being
 721 * constructed by execve().
 722 */
 723static int get_file_caps(struct linux_binprm *bprm, struct file *file,
 724                         bool *effective, bool *has_fcap)
 725{
 726        int rc = 0;
 727        struct cpu_vfs_cap_data vcaps;
 728
 729        cap_clear(bprm->cred->cap_permitted);
 730
 731        if (!file_caps_enabled)
 732                return 0;
 733
 734        if (!mnt_may_suid(file->f_path.mnt))
 735                return 0;
 736
 737        /*
 738         * This check is redundant with mnt_may_suid() but is kept to make
 739         * explicit that capability bits are limited to s_user_ns and its
 740         * descendants.
 741         */
 742        if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
 743                return 0;
 744
 745        rc = get_vfs_caps_from_disk(file_mnt_idmap(file),
 746                                    file->f_path.dentry, &vcaps);
 747        if (rc < 0) {
 748                if (rc == -EINVAL)
 749                        printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
 750                                        bprm->filename);
 751                else if (rc == -ENODATA)
 752                        rc = 0;
 753                goto out;
 754        }
 755
 756        rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);
 757
 758out:
 759        if (rc)
 760                cap_clear(bprm->cred->cap_permitted);
 761
 762        return rc;
 763}
 764
 765static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); }
 766
 767static inline bool __is_real(kuid_t uid, struct cred *cred)
 768{ return uid_eq(cred->uid, uid); }
 769
 770static inline bool __is_eff(kuid_t uid, struct cred *cred)
 771{ return uid_eq(cred->euid, uid); }
 772
 773static inline bool __is_suid(kuid_t uid, struct cred *cred)
 774{ return !__is_real(uid, cred) && __is_eff(uid, cred); }
 775
 776/*
 777 * handle_privileged_root - Handle case of privileged root
 778 * @bprm: The execution parameters, including the proposed creds
 779 * @has_fcap: Are any file capabilities set?
 780 * @effective: Do we have effective root privilege?
 781 * @root_uid: This namespace' root UID WRT initial USER namespace
 782 *
 783 * Handle the case where root is privileged and hasn't been neutered by
 784 * SECURE_NOROOT.  If file capabilities are set, they won't be combined with
 785 * set UID root and nothing is changed.  If we are root, cap_permitted is
 786 * updated.  If we have become set UID root, the effective bit is set.
 787 */
 788static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
 789                                   bool *effective, kuid_t root_uid)
 790{
 791        const struct cred *old = current_cred();
 792        struct cred *new = bprm->cred;
 793
 794        if (!root_privileged())
 795                return;
 796        /*
 797         * If the legacy file capability is set, then don't set privs
 798         * for a setuid root binary run by a non-root user.  Do set it
 799         * for a root user just to cause least surprise to an admin.
 800         */
 801        if (has_fcap && __is_suid(root_uid, new)) {
 802                warn_setuid_and_fcaps_mixed(bprm->filename);
 803                return;
 804        }
 805        /*
 806         * To support inheritance of root-permissions and suid-root
 807         * executables under compatibility mode, we override the
 808         * capability sets for the file.
 809         */
 810        if (__is_eff(root_uid, new) || __is_real(root_uid, new)) {
 811                /* pP' = (cap_bset & ~0) | (pI & ~0) */
 812                new->cap_permitted = cap_combine(old->cap_bset,
 813                                                 old->cap_inheritable);
 814        }
 815        /*
 816         * If only the real uid is 0, we do not set the effective bit.
 817         */
 818        if (__is_eff(root_uid, new))
 819                *effective = true;
 820}
 821
 822#define __cap_gained(field, target, source) \
 823        !cap_issubset(target->cap_##field, source->cap_##field)
 824#define __cap_grew(target, source, cred) \
 825        !cap_issubset(cred->cap_##target, cred->cap_##source)
 826#define __cap_full(field, cred) \
 827        cap_issubset(CAP_FULL_SET, cred->cap_##field)
 828
 829static inline bool __is_setuid(struct cred *new, const struct cred *old)
 830{ return !uid_eq(new->euid, old->uid); }
 831
 832static inline bool __is_setgid(struct cred *new, const struct cred *old)
 833{ return !gid_eq(new->egid, old->gid); }
 834
 835/*
 836 * 1) Audit candidate if current->cap_effective is set
 837 *
 838 * We do not bother to audit if 3 things are true:
 839 *   1) cap_effective has all caps
 840 *   2) we became root *OR* are were already root
 841 *   3) root is supposed to have all caps (SECURE_NOROOT)
 842 * Since this is just a normal root execing a process.
 843 *
 844 * Number 1 above might fail if you don't have a full bset, but I think
 845 * that is interesting information to audit.
 846 *
 847 * A number of other conditions require logging:
 848 * 2) something prevented setuid root getting all caps
 849 * 3) non-setuid root gets fcaps
 850 * 4) non-setuid root gets ambient
 851 */
 852static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
 853                                     kuid_t root, bool has_fcap)
 854{
 855        bool ret = false;
 856
 857        if ((__cap_grew(effective, ambient, new) &&
 858             !(__cap_full(effective, new) &&
 859               (__is_eff(root, new) || __is_real(root, new)) &&
 860               root_privileged())) ||
 861            (root_privileged() &&
 862             __is_suid(root, new) &&
 863             !__cap_full(effective, new)) ||
 864            (!__is_setuid(new, old) &&
 865             ((has_fcap &&
 866               __cap_gained(permitted, new, old)) ||
 867              __cap_gained(ambient, new, old))))
 868
 869                ret = true;
 870
 871        return ret;
 872}
 873
 874/**
 875 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
 876 * @bprm: The execution parameters, including the proposed creds
 877 * @file: The file to pull the credentials from
 878 *
 879 * Set up the proposed credentials for a new execution context being
 880 * constructed by execve().  The proposed creds in @bprm->cred is altered,
 881 * which won't take effect immediately.
 882 *
 883 * Return: 0 if successful, -ve on error.
 884 */
 885int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
 886{
 887        /* Process setpcap binaries and capabilities for uid 0 */
 888        const struct cred *old = current_cred();
 889        struct cred *new = bprm->cred;
 890        bool effective = false, has_fcap = false, is_setid;
 891        int ret;
 892        kuid_t root_uid;
 893
 894        if (WARN_ON(!cap_ambient_invariant_ok(old)))
 895                return -EPERM;
 896
 897        ret = get_file_caps(bprm, file, &effective, &has_fcap);
 898        if (ret < 0)
 899                return ret;
 900
 901        root_uid = make_kuid(new->user_ns, 0);
 902
 903        handle_privileged_root(bprm, has_fcap, &effective, root_uid);
 904
 905        /* if we have fs caps, clear dangerous personality flags */
 906        if (__cap_gained(permitted, new, old))
 907                bprm->per_clear |= PER_CLEAR_ON_SETID;
 908
 909        /* Don't let someone trace a set[ug]id/setpcap binary with the revised
 910         * credentials unless they have the appropriate permit.
 911         *
 912         * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
 913         */
 914        is_setid = __is_setuid(new, old) || __is_setgid(new, old);
 915
 916        if ((is_setid || __cap_gained(permitted, new, old)) &&
 917            ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
 918             !ptracer_capable(current, new->user_ns))) {
 919                /* downgrade; they get no more than they had, and maybe less */
 920                if (!ns_capable(new->user_ns, CAP_SETUID) ||
 921                    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
 922                        new->euid = new->uid;
 923                        new->egid = new->gid;
 924                }
 925                new->cap_permitted = cap_intersect(new->cap_permitted,
 926                                                   old->cap_permitted);
 927        }
 928
 929        new->suid = new->fsuid = new->euid;
 930        new->sgid = new->fsgid = new->egid;
 931
 932        /* File caps or setid cancels ambient. */
 933        if (has_fcap || is_setid)
 934                cap_clear(new->cap_ambient);
 935
 936        /*
 937         * Now that we've computed pA', update pP' to give:
 938         *   pP' = (X & fP) | (pI & fI) | pA'
 939         */
 940        new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
 941
 942        /*
 943         * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
 944         * this is the same as pE' = (fE ? pP' : 0) | pA'.
 945         */
 946        if (effective)
 947                new->cap_effective = new->cap_permitted;
 948        else
 949                new->cap_effective = new->cap_ambient;
 950
 951        if (WARN_ON(!cap_ambient_invariant_ok(new)))
 952                return -EPERM;
 953
 954        if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
 955                ret = audit_log_bprm_fcaps(bprm, new, old);
 956                if (ret < 0)
 957                        return ret;
 958        }
 959
 960        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
 961
 962        if (WARN_ON(!cap_ambient_invariant_ok(new)))
 963                return -EPERM;
 964
 965        /* Check for privilege-elevated exec. */
 966        if (is_setid ||
 967            (!__is_real(root_uid, new) &&
 968             (effective ||
 969              __cap_grew(permitted, ambient, new))))
 970                bprm->secureexec = 1;
 971
 972        return 0;
 973}
 974
 975/**
 976 * cap_inode_setxattr - Determine whether an xattr may be altered
 977 * @dentry: The inode/dentry being altered
 978 * @name: The name of the xattr to be changed
 979 * @value: The value that the xattr will be changed to
 980 * @size: The size of value
 981 * @flags: The replacement flag
 982 *
 983 * Determine whether an xattr may be altered or set on an inode, returning 0 if
 984 * permission is granted, -ve if denied.
 985 *
 986 * This is used to make sure security xattrs don't get updated or set by those
 987 * who aren't privileged to do so.
 988 */
 989int cap_inode_setxattr(struct dentry *dentry, const char *name,
 990                       const void *value, size_t size, int flags)
 991{
 992        struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
 993
 994        /* Ignore non-security xattrs */
 995        if (strncmp(name, XATTR_SECURITY_PREFIX,
 996                        XATTR_SECURITY_PREFIX_LEN) != 0)
 997                return 0;
 998
 999        /*
1000         * For XATTR_NAME_CAPS the check will be done in
1001         * cap_convert_nscap(), called by setxattr()
1002         */
1003        if (strcmp(name, XATTR_NAME_CAPS) == 0)
1004                return 0;
1005
1006        if (!ns_capable(user_ns, CAP_SYS_ADMIN))
1007                return -EPERM;
1008        return 0;
1009}
1010
1011/**
1012 * cap_inode_removexattr - Determine whether an xattr may be removed
1013 *
1014 * @idmap:      idmap of the mount the inode was found from
1015 * @dentry:     The inode/dentry being altered
1016 * @name:       The name of the xattr to be changed
1017 *
1018 * Determine whether an xattr may be removed from an inode, returning 0 if
1019 * permission is granted, -ve if denied.
1020 *
1021 * If the inode has been found through an idmapped mount the idmap of
1022 * the vfsmount must be passed through @idmap. This function will then
1023 * take care to map the inode according to @idmap before checking
1024 * permissions. On non-idmapped mounts or if permission checking is to be
1025 * performed on the raw inode simply pass @nop_mnt_idmap.
1026 *
1027 * This is used to make sure security xattrs don't get removed by those who
1028 * aren't privileged to remove them.
1029 */
1030int cap_inode_removexattr(struct mnt_idmap *idmap,
1031                          struct dentry *dentry, const char *name)
1032{
1033        struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
1034
1035        /* Ignore non-security xattrs */
1036        if (strncmp(name, XATTR_SECURITY_PREFIX,
1037                        XATTR_SECURITY_PREFIX_LEN) != 0)
1038                return 0;
1039
1040        if (strcmp(name, XATTR_NAME_CAPS) == 0) {
1041                /* security.capability gets namespaced */
1042                struct inode *inode = d_backing_inode(dentry);
1043                if (!inode)
1044                        return -EINVAL;
1045                if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
1046                        return -EPERM;
1047                return 0;
1048        }
1049
1050        if (!ns_capable(user_ns, CAP_SYS_ADMIN))
1051                return -EPERM;
1052        return 0;
1053}
1054
1055/*
1056 * cap_emulate_setxuid() fixes the effective / permitted capabilities of
1057 * a process after a call to setuid, setreuid, or setresuid.
1058 *
1059 *  1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
1060 *  {r,e,s}uid != 0, the permitted and effective capabilities are
1061 *  cleared.
1062 *
1063 *  2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
1064 *  capabilities of the process are cleared.
1065 *
1066 *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
1067 *  capabilities are set to the permitted capabilities.
1068 *
1069 *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
1070 *  never happen.
1071 *
1072 *  -astor
1073 *
1074 * cevans - New behaviour, Oct '99
1075 * A process may, via prctl(), elect to keep its capabilities when it
1076 * calls setuid() and switches away from uid==0. Both permitted and
1077 * effective sets will be retained.
1078 * Without this change, it was impossible for a daemon to drop only some
1079 * of its privilege. The call to setuid(!=0) would drop all privileges!
1080 * Keeping uid 0 is not an option because uid 0 owns too many vital
1081 * files..
1082 * Thanks to Olaf Kirch and Peter Benie for spotting this.
1083 */
1084static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
1085{
1086        kuid_t root_uid = make_kuid(old->user_ns, 0);
1087
1088        if ((uid_eq(old->uid, root_uid) ||
1089             uid_eq(old->euid, root_uid) ||
1090             uid_eq(old->suid, root_uid)) &&
1091            (!uid_eq(new->uid, root_uid) &&
1092             !uid_eq(new->euid, root_uid) &&
1093             !uid_eq(new->suid, root_uid))) {
1094                if (!issecure(SECURE_KEEP_CAPS)) {
1095                        cap_clear(new->cap_permitted);
1096                        cap_clear(new->cap_effective);
1097                }
1098
1099                /*
1100                 * Pre-ambient programs expect setresuid to nonroot followed
1101                 * by exec to drop capabilities.  We should make sure that
1102                 * this remains the case.
1103                 */
1104                cap_clear(new->cap_ambient);
1105        }
1106        if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
1107                cap_clear(new->cap_effective);
1108        if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
1109                new->cap_effective = new->cap_permitted;
1110}
1111
1112/**
1113 * cap_task_fix_setuid - Fix up the results of setuid() call
1114 * @new: The proposed credentials
1115 * @old: The current task's current credentials
1116 * @flags: Indications of what has changed
1117 *
1118 * Fix up the results of setuid() call before the credential changes are
1119 * actually applied.
1120 *
1121 * Return: 0 to grant the changes, -ve to deny them.
1122 */
1123int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
1124{
1125        switch (flags) {
1126        case LSM_SETID_RE:
1127        case LSM_SETID_ID:
1128        case LSM_SETID_RES:
1129                /* juggle the capabilities to follow [RES]UID changes unless
1130                 * otherwise suppressed */
1131                if (!issecure(SECURE_NO_SETUID_FIXUP))
1132                        cap_emulate_setxuid(new, old);
1133                break;
1134
1135        case LSM_SETID_FS:
1136                /* juggle the capabilities to follow FSUID changes, unless
1137                 * otherwise suppressed
1138                 *
1139                 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
1140                 *          if not, we might be a bit too harsh here.
1141                 */
1142                if (!issecure(SECURE_NO_SETUID_FIXUP)) {
1143                        kuid_t root_uid = make_kuid(old->user_ns, 0);
1144                        if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid))
1145                                new->cap_effective =
1146                                        cap_drop_fs_set(new->cap_effective);
1147
1148                        if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid))
1149                                new->cap_effective =
1150                                        cap_raise_fs_set(new->cap_effective,
1151                                                         new->cap_permitted);
1152                }
1153                break;
1154
1155        default:
1156                return -EINVAL;
1157        }
1158
1159        return 0;
1160}
1161
1162/*
1163 * Rationale: code calling task_setscheduler, task_setioprio, and
1164 * task_setnice, assumes that
1165 *   . if capable(cap_sys_nice), then those actions should be allowed
1166 *   . if not capable(cap_sys_nice), but acting on your own processes,
1167 *      then those actions should be allowed
1168 * This is insufficient now since you can call code without suid, but
1169 * yet with increased caps.
1170 * So we check for increased caps on the target process.
1171 */
1172static int cap_safe_nice(struct task_struct *p)
1173{
1174        int is_subset, ret = 0;
1175
1176        rcu_read_lock();
1177        is_subset = cap_issubset(__task_cred(p)->cap_permitted,
1178                                 current_cred()->cap_permitted);
1179        if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
1180                ret = -EPERM;
1181        rcu_read_unlock();
1182
1183        return ret;
1184}
1185
/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Applies the cap_safe_nice() check to @p to decide whether the requested
 * scheduler policy change is permitted.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setscheduler(struct task_struct *p)
{
	return cap_safe_nice(p);
}
1199
/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set (not examined here)
 *
 * Applies the cap_safe_nice() check to @p to decide whether the requested
 * I/O priority change is permitted.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
	return cap_safe_nice(p);
}
1214
/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set (not examined here)
 *
 * Applies the cap_safe_nice() check to @p to decide whether the requested
 * task priority change is permitted.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
	return cap_safe_nice(p);
}
1229
1230/*
1231 * Implement PR_CAPBSET_DROP.  Attempt to remove the specified capability from
1232 * the current task's bounding set.  Returns 0 on success, -ve on error.
1233 */
1234static int cap_prctl_drop(unsigned long cap)
1235{
1236        struct cred *new;
1237
1238        if (!ns_capable(current_user_ns(), CAP_SETPCAP))
1239                return -EPERM;
1240        if (!cap_valid(cap))
1241                return -EINVAL;
1242
1243        new = prepare_creds();
1244        if (!new)
1245                return -ENOMEM;
1246        cap_lower(new->cap_bset, cap);
1247        return commit_creds(new);
1248}
1249
1250/**
1251 * cap_task_prctl - Implement process control functions for this security module
1252 * @option: The process control function requested
1253 * @arg2: The argument data for this function
1254 * @arg3: The argument data for this function
1255 * @arg4: The argument data for this function
1256 * @arg5: The argument data for this function
1257 *
1258 * Allow process control functions (sys_prctl()) to alter capabilities; may
1259 * also deny access to other functions not otherwise implemented here.
1260 *
1261 * Return: 0 or +ve on success, -ENOSYS if this function is not implemented
1262 * here, other -ve on error.  If -ENOSYS is returned, sys_prctl() and other LSM
1263 * modules will consider performing the function.
1264 */
1265int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
1266                   unsigned long arg4, unsigned long arg5)
1267{
1268        const struct cred *old = current_cred();
1269        struct cred *new;
1270
1271        switch (option) {
1272        case PR_CAPBSET_READ:
1273                if (!cap_valid(arg2))
1274                        return -EINVAL;
1275                return !!cap_raised(old->cap_bset, arg2);
1276
1277        case PR_CAPBSET_DROP:
1278                return cap_prctl_drop(arg2);
1279
1280        /*
1281         * The next four prctl's remain to assist with transitioning a
1282         * system from legacy UID=0 based privilege (when filesystem
1283         * capabilities are not in use) to a system using filesystem
1284         * capabilities only - as the POSIX.1e draft intended.
1285         *
1286         * Note:
1287         *
1288         *  PR_SET_SECUREBITS =
1289         *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
1290         *    | issecure_mask(SECURE_NOROOT)
1291         *    | issecure_mask(SECURE_NOROOT_LOCKED)
1292         *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
1293         *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
1294         *
1295         * will ensure that the current process and all of its
1296         * children will be locked into a pure
1297         * capability-based-privilege environment.
1298         */
1299        case PR_SET_SECUREBITS:
1300                if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
1301                     & (old->securebits ^ arg2))                        /*[1]*/
1302                    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))   /*[2]*/
1303                    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))   /*[3]*/
1304                    || (cap_capable(current_cred(),
1305                                    current_cred()->user_ns,
1306                                    CAP_SETPCAP,
1307                                    CAP_OPT_NONE) != 0)                 /*[4]*/
1308                        /*
1309                         * [1] no changing of bits that are locked
1310                         * [2] no unlocking of locks
1311                         * [3] no setting of unsupported bits
1312                         * [4] doing anything requires privilege (go read about
1313                         *     the "sendmail capabilities bug")
1314                         */
1315                    )
1316                        /* cannot change a locked bit */
1317                        return -EPERM;
1318
1319                new = prepare_creds();
1320                if (!new)
1321                        return -ENOMEM;
1322                new->securebits = arg2;
1323                return commit_creds(new);
1324
1325        case PR_GET_SECUREBITS:
1326                return old->securebits;
1327
1328        case PR_GET_KEEPCAPS:
1329                return !!issecure(SECURE_KEEP_CAPS);
1330
1331        case PR_SET_KEEPCAPS:
1332                if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
1333                        return -EINVAL;
1334                if (issecure(SECURE_KEEP_CAPS_LOCKED))
1335                        return -EPERM;
1336
1337                new = prepare_creds();
1338                if (!new)
1339                        return -ENOMEM;
1340                if (arg2)
1341                        new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
1342                else
1343                        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
1344                return commit_creds(new);
1345
1346        case PR_CAP_AMBIENT:
1347                if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
1348                        if (arg3 | arg4 | arg5)
1349                                return -EINVAL;
1350
1351                        new = prepare_creds();
1352                        if (!new)
1353                                return -ENOMEM;
1354                        cap_clear(new->cap_ambient);
1355                        return commit_creds(new);
1356                }
1357
1358                if (((!cap_valid(arg3)) | arg4 | arg5))
1359                        return -EINVAL;
1360
1361                if (arg2 == PR_CAP_AMBIENT_IS_SET) {
1362                        return !!cap_raised(current_cred()->cap_ambient, arg3);
1363                } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
1364                           arg2 != PR_CAP_AMBIENT_LOWER) {
1365                        return -EINVAL;
1366                } else {
1367                        if (arg2 == PR_CAP_AMBIENT_RAISE &&
1368                            (!cap_raised(current_cred()->cap_permitted, arg3) ||
1369                             !cap_raised(current_cred()->cap_inheritable,
1370                                         arg3) ||
1371                             issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
1372                                return -EPERM;
1373
1374                        new = prepare_creds();
1375                        if (!new)
1376                                return -ENOMEM;
1377                        if (arg2 == PR_CAP_AMBIENT_RAISE)
1378                                cap_raise(new->cap_ambient, arg3);
1379                        else
1380                                cap_lower(new->cap_ambient, arg3);
1381                        return commit_creds(new);
1382                }
1383
1384        default:
1385                /* No functionality available - continue with default */
1386                return -ENOSYS;
1387        }
1388}
1389
1390/**
1391 * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
1392 * @mm: The VM space in which the new mapping is to be made
1393 * @pages: The size of the mapping
1394 *
1395 * Determine whether the allocation of a new virtual mapping by the current
1396 * task is permitted.
1397 *
1398 * Return: 1 if permission is granted, 0 if not.
1399 */
1400int cap_vm_enough_memory(struct mm_struct *mm, long pages)
1401{
1402        int cap_sys_admin = 0;
1403
1404        if (cap_capable(current_cred(), &init_user_ns,
1405                                CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0)
1406                cap_sys_admin = 1;
1407
1408        return cap_sys_admin;
1409}
1410
1411/**
1412 * cap_mmap_addr - check if able to map given addr
1413 * @addr: address attempting to be mapped
1414 *
1415 * If the process is attempting to map memory below dac_mmap_min_addr they need
1416 * CAP_SYS_RAWIO.  The other parameters to this function are unused by the
1417 * capability security module.
1418 *
1419 * Return: 0 if this mapping should be allowed or -EPERM if not.
1420 */
1421int cap_mmap_addr(unsigned long addr)
1422{
1423        int ret = 0;
1424
1425        if (addr < dac_mmap_min_addr) {
1426                ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
1427                                  CAP_OPT_NONE);
1428                /* set PF_SUPERPRIV if it turns out we allow the low mmap */
1429                if (ret == 0)
1430                        current->flags |= PF_SUPERPRIV;
1431        }
1432        return ret;
1433}
1434
/**
 * cap_mmap_file - mmap_file hook of the capability module
 * @file: The file being mapped (not examined here)
 * @reqprot: The requested protection (not examined here)
 * @prot: The protection that will be applied (not examined here)
 * @flags: The mapping flags (not examined here)
 *
 * The capability module places no restriction on file mappings.
 *
 * Return: Always 0.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
		  unsigned long prot, unsigned long flags)
{
	return 0;
}
1440
1441#ifdef CONFIG_SECURITY
1442
1443static struct security_hook_list capability_hooks[] __ro_after_init = {
1444        LSM_HOOK_INIT(capable, cap_capable),
1445        LSM_HOOK_INIT(settime, cap_settime),
1446        LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
1447        LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
1448        LSM_HOOK_INIT(capget, cap_capget),
1449        LSM_HOOK_INIT(capset, cap_capset),
1450        LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
1451        LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
1452        LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
1453        LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
1454        LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
1455        LSM_HOOK_INIT(mmap_file, cap_mmap_file),
1456        LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
1457        LSM_HOOK_INIT(task_prctl, cap_task_prctl),
1458        LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
1459        LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
1460        LSM_HOOK_INIT(task_setnice, cap_task_setnice),
1461        LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
1462};
1463
1464static int __init capability_init(void)
1465{
1466        security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
1467                                "capability");
1468        return 0;
1469}
1470
1471DEFINE_LSM(capability) = {
1472        .name = "capability",
1473        .order = LSM_ORDER_FIRST,
1474        .init = capability_init,
1475};
1476
1477#endif /* CONFIG_SECURITY */
1478