linux/kernel/taskstats.c
<<
>>
Prefs
   1/*
   2 * taskstats.c - Export per-task statistics to userland
   3 *
   4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
   5 *           (C) Balbir Singh,   IBM Corp. 2006
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License as published by
   9 * the Free Software Foundation; either version 2 of the License, or
  10 * (at your option) any later version.
  11 *
  12 * This program is distributed in the hope that it will be useful,
  13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 * GNU General Public License for more details.
  16 *
  17 */
  18
  19#include <linux/kernel.h>
  20#include <linux/taskstats_kern.h>
  21#include <linux/tsacct_kern.h>
  22#include <linux/delayacct.h>
  23#include <linux/cpumask.h>
  24#include <linux/percpu.h>
  25#include <linux/slab.h>
  26#include <linux/cgroupstats.h>
  27#include <linux/cgroup.h>
  28#include <linux/fs.h>
  29#include <linux/file.h>
  30#include <net/genetlink.h>
  31#include <asm/atomic.h>
  32
  33/*
  34 * Maximum length of a cpumask that can be specified in
  35 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
  36 */
  37#define TASKSTATS_CPUMASK_MAXLEN        (100+6*NR_CPUS)
  38
  39static DEFINE_PER_CPU(__u32, taskstats_seqnum);
  40static int family_registered;
  41struct kmem_cache *taskstats_cache;
  42
  43static struct genl_family family = {
  44        .id             = GENL_ID_GENERATE,
  45        .name           = TASKSTATS_GENL_NAME,
  46        .version        = TASKSTATS_GENL_VERSION,
  47        .maxattr        = TASKSTATS_CMD_ATTR_MAX,
  48};
  49
  50static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
  51        [TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
  52        [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
  53        [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
  54        [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
  55
  56static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
  57        [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
  58};
  59
  60struct listener {
  61        struct list_head list;
  62        pid_t pid;
  63        char valid;
  64};
  65
  66struct listener_list {
  67        struct rw_semaphore sem;
  68        struct list_head list;
  69};
  70static DEFINE_PER_CPU(struct listener_list, listener_array);
  71
  72enum actions {
  73        REGISTER,
  74        DEREGISTER,
  75        CPU_DONT_CARE
  76};
  77
  78static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
  79                                size_t size)
  80{
  81        struct sk_buff *skb;
  82        void *reply;
  83
  84        /*
  85         * If new attributes are added, please revisit this allocation
  86         */
  87        skb = genlmsg_new(size, GFP_KERNEL);
  88        if (!skb)
  89                return -ENOMEM;
  90
  91        if (!info) {
  92                int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
  93
  94                reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
  95        } else
  96                reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
  97        if (reply == NULL) {
  98                nlmsg_free(skb);
  99                return -EINVAL;
 100        }
 101
 102        *skbp = skb;
 103        return 0;
 104}
 105
 106/*
 107 * Send taskstats data in @skb to listener with nl_pid @pid
 108 */
 109static int send_reply(struct sk_buff *skb, struct genl_info *info)
 110{
 111        struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
 112        void *reply = genlmsg_data(genlhdr);
 113        int rc;
 114
 115        rc = genlmsg_end(skb, reply);
 116        if (rc < 0) {
 117                nlmsg_free(skb);
 118                return rc;
 119        }
 120
 121        return genlmsg_reply(skb, info);
 122}
 123
 124/*
 125 * Send taskstats data in @skb to listeners registered for @cpu's exit data
 126 */
 127static void send_cpu_listeners(struct sk_buff *skb,
 128                                        struct listener_list *listeners)
 129{
 130        struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
 131        struct listener *s, *tmp;
 132        struct sk_buff *skb_next, *skb_cur = skb;
 133        void *reply = genlmsg_data(genlhdr);
 134        int rc, delcount = 0;
 135
 136        rc = genlmsg_end(skb, reply);
 137        if (rc < 0) {
 138                nlmsg_free(skb);
 139                return;
 140        }
 141
 142        rc = 0;
 143        down_read(&listeners->sem);
 144        list_for_each_entry(s, &listeners->list, list) {
 145                skb_next = NULL;
 146                if (!list_is_last(&s->list, &listeners->list)) {
 147                        skb_next = skb_clone(skb_cur, GFP_KERNEL);
 148                        if (!skb_next)
 149                                break;
 150                }
 151                rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
 152                if (rc == -ECONNREFUSED) {
 153                        s->valid = 0;
 154                        delcount++;
 155                }
 156                skb_cur = skb_next;
 157        }
 158        up_read(&listeners->sem);
 159
 160        if (skb_cur)
 161                nlmsg_free(skb_cur);
 162
 163        if (!delcount)
 164                return;
 165
 166        /* Delete invalidated entries */
 167        down_write(&listeners->sem);
 168        list_for_each_entry_safe(s, tmp, &listeners->list, list) {
 169                if (!s->valid) {
 170                        list_del(&s->list);
 171                        kfree(s);
 172                }
 173        }
 174        up_write(&listeners->sem);
 175}
 176
 177static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
 178{
 179        memset(stats, 0, sizeof(*stats));
 180        /*
 181         * Each accounting subsystem adds calls to its functions to
 182         * fill in relevant parts of struct taskstsats as follows
 183         *
 184         *      per-task-foo(stats, tsk);
 185         */
 186
 187        delayacct_add_tsk(stats, tsk);
 188
 189        /* fill in basic acct fields */
 190        stats->version = TASKSTATS_VERSION;
 191        stats->nvcsw = tsk->nvcsw;
 192        stats->nivcsw = tsk->nivcsw;
 193        bacct_add_tsk(stats, tsk);
 194
 195        /* fill in extended acct fields */
 196        xacct_add_tsk(stats, tsk);
 197}
 198
 199static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
 200{
 201        struct task_struct *tsk;
 202
 203        rcu_read_lock();
 204        tsk = find_task_by_vpid(pid);
 205        if (tsk)
 206                get_task_struct(tsk);
 207        rcu_read_unlock();
 208        if (!tsk)
 209                return -ESRCH;
 210        fill_stats(tsk, stats);
 211        put_task_struct(tsk);
 212        return 0;
 213}
 214
 215static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
 216{
 217        struct task_struct *tsk, *first;
 218        unsigned long flags;
 219        int rc = -ESRCH;
 220
 221        /*
 222         * Add additional stats from live tasks except zombie thread group
 223         * leaders who are already counted with the dead tasks
 224         */
 225        rcu_read_lock();
 226        first = find_task_by_vpid(tgid);
 227
 228        if (!first || !lock_task_sighand(first, &flags))
 229                goto out;
 230
 231        if (first->signal->stats)
 232                memcpy(stats, first->signal->stats, sizeof(*stats));
 233        else
 234                memset(stats, 0, sizeof(*stats));
 235
 236        tsk = first;
 237        do {
 238                if (tsk->exit_state)
 239                        continue;
 240                /*
 241                 * Accounting subsystem can call its functions here to
 242                 * fill in relevant parts of struct taskstsats as follows
 243                 *
 244                 *      per-task-foo(stats, tsk);
 245                 */
 246                delayacct_add_tsk(stats, tsk);
 247
 248                stats->nvcsw += tsk->nvcsw;
 249                stats->nivcsw += tsk->nivcsw;
 250        } while_each_thread(first, tsk);
 251
 252        unlock_task_sighand(first, &flags);
 253        rc = 0;
 254out:
 255        rcu_read_unlock();
 256
 257        stats->version = TASKSTATS_VERSION;
 258        /*
 259         * Accounting subsystems can also add calls here to modify
 260         * fields of taskstats.
 261         */
 262        return rc;
 263}
 264
 265static void fill_tgid_exit(struct task_struct *tsk)
 266{
 267        unsigned long flags;
 268
 269        spin_lock_irqsave(&tsk->sighand->siglock, flags);
 270        if (!tsk->signal->stats)
 271                goto ret;
 272
 273        /*
 274         * Each accounting subsystem calls its functions here to
 275         * accumalate its per-task stats for tsk, into the per-tgid structure
 276         *
 277         *      per-task-foo(tsk->signal->stats, tsk);
 278         */
 279        delayacct_add_tsk(tsk->signal->stats, tsk);
 280ret:
 281        spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
 282        return;
 283}
 284
 285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
 286{
 287        struct listener_list *listeners;
 288        struct listener *s, *tmp, *s2;
 289        unsigned int cpu;
 290
 291        if (!cpumask_subset(mask, cpu_possible_mask))
 292                return -EINVAL;
 293
 294        s = NULL;
 295        if (isadd == REGISTER) {
 296                for_each_cpu(cpu, mask) {
 297                        if (!s)
 298                                s = kmalloc_node(sizeof(struct listener),
 299                                                 GFP_KERNEL, cpu_to_node(cpu));
 300                        if (!s)
 301                                goto cleanup;
 302                        s->pid = pid;
 303                        INIT_LIST_HEAD(&s->list);
 304                        s->valid = 1;
 305
 306                        listeners = &per_cpu(listener_array, cpu);
 307                        down_write(&listeners->sem);
 308                        list_for_each_entry_safe(s2, tmp, &listeners->list, list) {
 309                                if (s2->pid == pid)
 310                                        goto next_cpu;
 311                        }
 312                        list_add(&s->list, &listeners->list);
 313                        s = NULL;
 314next_cpu:
 315                        up_write(&listeners->sem);
 316                }
 317                kfree(s);
 318                return 0;
 319        }
 320
 321        /* Deregister or cleanup */
 322cleanup:
 323        for_each_cpu(cpu, mask) {
 324                listeners = &per_cpu(listener_array, cpu);
 325                down_write(&listeners->sem);
 326                list_for_each_entry_safe(s, tmp, &listeners->list, list) {
 327                        if (s->pid == pid) {
 328                                list_del(&s->list);
 329                                kfree(s);
 330                                break;
 331                        }
 332                }
 333                up_write(&listeners->sem);
 334        }
 335        return 0;
 336}
 337
 338static int parse(struct nlattr *na, struct cpumask *mask)
 339{
 340        char *data;
 341        int len;
 342        int ret;
 343
 344        if (na == NULL)
 345                return 1;
 346        len = nla_len(na);
 347        if (len > TASKSTATS_CPUMASK_MAXLEN)
 348                return -E2BIG;
 349        if (len < 1)
 350                return -EINVAL;
 351        data = kmalloc(len, GFP_KERNEL);
 352        if (!data)
 353                return -ENOMEM;
 354        nla_strlcpy(data, na, len);
 355        ret = cpulist_parse(data, mask);
 356        kfree(data);
 357        return ret;
 358}
 359
 360#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
 361#define TASKSTATS_NEEDS_PADDING 1
 362#endif
 363
 364static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 365{
 366        struct nlattr *na, *ret;
 367        int aggr;
 368
 369        aggr = (type == TASKSTATS_TYPE_PID)
 370                        ? TASKSTATS_TYPE_AGGR_PID
 371                        : TASKSTATS_TYPE_AGGR_TGID;
 372
 373        /*
 374         * The taskstats structure is internally aligned on 8 byte
 375         * boundaries but the layout of the aggregrate reply, with
 376         * two NLA headers and the pid (each 4 bytes), actually
 377         * force the entire structure to be unaligned. This causes
 378         * the kernel to issue unaligned access warnings on some
 379         * architectures like ia64. Unfortunately, some software out there
 380         * doesn't properly unroll the NLA packet and assumes that the start
 381         * of the taskstats structure will always be 20 bytes from the start
 382         * of the netlink payload. Aligning the start of the taskstats
 383         * structure breaks this software, which we don't want. So, for now
 384         * the alignment only happens on architectures that require it
 385         * and those users will have to update to fixed versions of those
 386         * packages. Space is reserved in the packet only when needed.
 387         * This ifdef should be removed in several years e.g. 2012 once
 388         * we can be confident that fixed versions are installed on most
 389         * systems. We add the padding before the aggregate since the
 390         * aggregate is already a defined type.
 391         */
 392#ifdef TASKSTATS_NEEDS_PADDING
 393        if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
 394                goto err;
 395#endif
 396        na = nla_nest_start(skb, aggr);
 397        if (!na)
 398                goto err;
 399
 400        if (nla_put(skb, type, sizeof(pid), &pid) < 0)
 401                goto err;
 402        ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
 403        if (!ret)
 404                goto err;
 405        nla_nest_end(skb, na);
 406
 407        return nla_data(ret);
 408err:
 409        return NULL;
 410}
 411
 412static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 413{
 414        int rc = 0;
 415        struct sk_buff *rep_skb;
 416        struct cgroupstats *stats;
 417        struct nlattr *na;
 418        size_t size;
 419        u32 fd;
 420        struct file *file;
 421        int fput_needed;
 422
 423        na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
 424        if (!na)
 425                return -EINVAL;
 426
 427        fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
 428        file = fget_light(fd, &fput_needed);
 429        if (!file)
 430                return 0;
 431
 432        size = nla_total_size(sizeof(struct cgroupstats));
 433
 434        rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
 435                                size);
 436        if (rc < 0)
 437                goto err;
 438
 439        na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
 440                                sizeof(struct cgroupstats));
 441        stats = nla_data(na);
 442        memset(stats, 0, sizeof(*stats));
 443
 444        rc = cgroupstats_build(stats, file->f_dentry);
 445        if (rc < 0) {
 446                nlmsg_free(rep_skb);
 447                goto err;
 448        }
 449
 450        rc = send_reply(rep_skb, info);
 451
 452err:
 453        fput_light(file, fput_needed);
 454        return rc;
 455}
 456
 457static int cmd_attr_register_cpumask(struct genl_info *info)
 458{
 459        cpumask_var_t mask;
 460        int rc;
 461
 462        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 463                return -ENOMEM;
 464        rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
 465        if (rc < 0)
 466                goto out;
 467        rc = add_del_listener(info->snd_pid, mask, REGISTER);
 468out:
 469        free_cpumask_var(mask);
 470        return rc;
 471}
 472
 473static int cmd_attr_deregister_cpumask(struct genl_info *info)
 474{
 475        cpumask_var_t mask;
 476        int rc;
 477
 478        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 479                return -ENOMEM;
 480        rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
 481        if (rc < 0)
 482                goto out;
 483        rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
 484out:
 485        free_cpumask_var(mask);
 486        return rc;
 487}
 488
 489static size_t taskstats_packet_size(void)
 490{
 491        size_t size;
 492
 493        size = nla_total_size(sizeof(u32)) +
 494                nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 495#ifdef TASKSTATS_NEEDS_PADDING
 496        size += nla_total_size(0); /* Padding for alignment */
 497#endif
 498        return size;
 499}
 500
 501static int cmd_attr_pid(struct genl_info *info)
 502{
 503        struct taskstats *stats;
 504        struct sk_buff *rep_skb;
 505        size_t size;
 506        u32 pid;
 507        int rc;
 508
 509        size = taskstats_packet_size();
 510
 511        rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
 512        if (rc < 0)
 513                return rc;
 514
 515        rc = -EINVAL;
 516        pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
 517        stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
 518        if (!stats)
 519                goto err;
 520
 521        rc = fill_stats_for_pid(pid, stats);
 522        if (rc < 0)
 523                goto err;
 524        return send_reply(rep_skb, info);
 525err:
 526        nlmsg_free(rep_skb);
 527        return rc;
 528}
 529
 530static int cmd_attr_tgid(struct genl_info *info)
 531{
 532        struct taskstats *stats;
 533        struct sk_buff *rep_skb;
 534        size_t size;
 535        u32 tgid;
 536        int rc;
 537
 538        size = taskstats_packet_size();
 539
 540        rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
 541        if (rc < 0)
 542                return rc;
 543
 544        rc = -EINVAL;
 545        tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
 546        stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
 547        if (!stats)
 548                goto err;
 549
 550        rc = fill_stats_for_tgid(tgid, stats);
 551        if (rc < 0)
 552                goto err;
 553        return send_reply(rep_skb, info);
 554err:
 555        nlmsg_free(rep_skb);
 556        return rc;
 557}
 558
 559static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 560{
 561        if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
 562                return cmd_attr_register_cpumask(info);
 563        else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
 564                return cmd_attr_deregister_cpumask(info);
 565        else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
 566                return cmd_attr_pid(info);
 567        else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
 568                return cmd_attr_tgid(info);
 569        else
 570                return -EINVAL;
 571}
 572
 573static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
 574{
 575        struct signal_struct *sig = tsk->signal;
 576        struct taskstats *stats;
 577
 578        if (sig->stats || thread_group_empty(tsk))
 579                goto ret;
 580
 581        /* No problem if kmem_cache_zalloc() fails */
 582        stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
 583
 584        spin_lock_irq(&tsk->sighand->siglock);
 585        if (!sig->stats) {
 586                sig->stats = stats;
 587                stats = NULL;
 588        }
 589        spin_unlock_irq(&tsk->sighand->siglock);
 590
 591        if (stats)
 592                kmem_cache_free(taskstats_cache, stats);
 593ret:
 594        return sig->stats;
 595}
 596
 597/* Send pid data out on exit */
 598void taskstats_exit(struct task_struct *tsk, int group_dead)
 599{
 600        int rc;
 601        struct listener_list *listeners;
 602        struct taskstats *stats;
 603        struct sk_buff *rep_skb;
 604        size_t size;
 605        int is_thread_group;
 606
 607        if (!family_registered)
 608                return;
 609
 610        /*
 611         * Size includes space for nested attributes
 612         */
 613        size = taskstats_packet_size();
 614
 615        is_thread_group = !!taskstats_tgid_alloc(tsk);
 616        if (is_thread_group) {
 617                /* PID + STATS + TGID + STATS */
 618                size = 2 * size;
 619                /* fill the tsk->signal->stats structure */
 620                fill_tgid_exit(tsk);
 621        }
 622
 623        listeners = __this_cpu_ptr(&listener_array);
 624        if (list_empty(&listeners->list))
 625                return;
 626
 627        rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
 628        if (rc < 0)
 629                return;
 630
 631        stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
 632        if (!stats)
 633                goto err;
 634
 635        fill_stats(tsk, stats);
 636
 637        /*
 638         * Doesn't matter if tsk is the leader or the last group member leaving
 639         */
 640        if (!is_thread_group || !group_dead)
 641                goto send;
 642
 643        stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
 644        if (!stats)
 645                goto err;
 646
 647        memcpy(stats, tsk->signal->stats, sizeof(*stats));
 648
 649send:
 650        send_cpu_listeners(rep_skb, listeners);
 651        return;
 652err:
 653        nlmsg_free(rep_skb);
 654}
 655
 656static struct genl_ops taskstats_ops = {
 657        .cmd            = TASKSTATS_CMD_GET,
 658        .doit           = taskstats_user_cmd,
 659        .policy         = taskstats_cmd_get_policy,
 660};
 661
 662static struct genl_ops cgroupstats_ops = {
 663        .cmd            = CGROUPSTATS_CMD_GET,
 664        .doit           = cgroupstats_user_cmd,
 665        .policy         = cgroupstats_cmd_get_policy,
 666};
 667
 668/* Needed early in initialization */
 669void __init taskstats_init_early(void)
 670{
 671        unsigned int i;
 672
 673        taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
 674        for_each_possible_cpu(i) {
 675                INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
 676                init_rwsem(&(per_cpu(listener_array, i).sem));
 677        }
 678}
 679
 680static int __init taskstats_init(void)
 681{
 682        int rc;
 683
 684        rc = genl_register_family(&family);
 685        if (rc)
 686                return rc;
 687
 688        rc = genl_register_ops(&family, &taskstats_ops);
 689        if (rc < 0)
 690                goto err;
 691
 692        rc = genl_register_ops(&family, &cgroupstats_ops);
 693        if (rc < 0)
 694                goto err_cgroup_ops;
 695
 696        family_registered = 1;
 697        pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
 698        return 0;
 699err_cgroup_ops:
 700        genl_unregister_ops(&family, &taskstats_ops);
 701err:
 702        genl_unregister_family(&family);
 703        return rc;
 704}
 705
 706/*
 707 * late initcall ensures initialization of statistics collection
 708 * mechanisms precedes initialization of the taskstats interface
 709 */
 710late_initcall(taskstats_init);
 711