linux/net/core/drop_monitor.c
/*
 * Monitoring code for network dropped packet alerts
 *
 * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
 */

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
#include <linux/if_arp.h>
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
#include <linux/netpoll.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <linux/percpu.h>
#include <linux/timer.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <net/genetlink.h>
#include <net/netevent.h>

#include <trace/events/skb.h>
#include <trace/events/napi.h>

#include <asm/unaligned.h>

#define TRACE_ON 1
#define TRACE_OFF 0

static void send_dm_alert(struct work_struct *unused);

/*
 * Globals: the current trace state, the mutex protecting it, and the
 * per-cpu data that carries the pending alert and the work handle
 * that will send up netlink alerts.
 */
static int trace_state = TRACE_OFF;
static DEFINE_MUTEX(trace_state_mutex);

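/*
 * Per-cpu state: the alert skb currently being filled in (published
 * via RCU), the remaining hit budget before new drops are discarded,
 * and the hysteresis timer/work pair that ships the alert upstream.
 */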
struct per_cpu_dm_data {
        struct work_struct dm_alert_work;
        struct sk_buff __rcu *skb;
        atomic_t dm_hit_count;
        struct timer_list send_timer;
        int cpu;
};

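/*
 * One entry per registered net_device: remembers the device's
 * rx_dropped count and when we last sampled it, so the NAPI poll
 * hook can detect hardware drops that have no tracepoint.
 */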
struct dm_hw_stat_delta {
        struct net_device *dev;
        unsigned long last_rx;
        struct list_head list;
        struct rcu_head rcu;
        unsigned long last_drop_val;
};

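/*
 * Generic netlink family over which userspace (e.g. the dropwatch
 * tool) issues NET_DM_CMD_START/STOP and listens for NET_DM_CMD_ALERT
 * multicasts.  Roughly, a client resolves the "NET_DM" family id,
 * sends NET_DM_CMD_START, and then reads net_dm_alert_msg payloads
 * from the NET_DM_GRP_ALERT group.
 */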
static struct genl_family net_drop_monitor_family = {
        .id             = GENL_ID_GENERATE,
        .hdrsize        = 0,
        .name           = "NET_DM",
        .version        = 2,
        .maxattr        = NET_DM_CMD_MAX,
};

static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);

static int dm_hit_limit = 64;           /* max drop points per alert */
static int dm_delay = 1;                /* hysteresis delay, in seconds */
static unsigned long dm_hw_check_delta = 2*HZ;
static LIST_HEAD(hw_stats_list);

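/*
 * Allocate a fresh alert skb, pre-sized for dm_hit_limit drop points,
 * publish it in place of the old one via RCU, and re-arm the hit
 * counter.  On allocation failure the alert work is rescheduled so we
 * retry later.
 */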
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
        size_t al;
        struct net_dm_alert_msg *msg;
        struct nlattr *nla;
        struct sk_buff *skb;
        struct sk_buff *oskb = rcu_dereference_protected(data->skb, 1);

        al = sizeof(struct net_dm_alert_msg);
        al += dm_hit_limit * sizeof(struct net_dm_drop_point);
        al += sizeof(struct nlattr);

        skb = genlmsg_new(al, GFP_KERNEL);

        if (skb) {
                genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
                                0, NET_DM_CMD_ALERT);
                nla = nla_reserve(skb, NLA_UNSPEC,
                                  sizeof(struct net_dm_alert_msg));
                msg = nla_data(nla);
                memset(msg, 0, al);
        } else
                schedule_work_on(data->cpu, &data->dm_alert_work);

        /*
         * Don't need to lock this, since we are guaranteed to only
         * run this on a single cpu at a time.
         * Note also that we only update data->skb if the old and new skb
         * pointers don't match.  This ensures that we don't continually call
         * synchronize_rcu if we repeatedly fail to alloc a new netlink message.
         */
        if (skb != oskb) {
                rcu_assign_pointer(data->skb, skb);

                synchronize_rcu();

                atomic_set(&data->dm_hit_count, dm_hit_limit);
        }
}

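/*
 * Work handler: detach the current per-cpu alert skb, swap in a fresh
 * one, and multicast the detached skb to the NET_DM_GRP_ALERT group.
 */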
static void send_dm_alert(struct work_struct *unused)
{
        struct sk_buff *skb;
        struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

        WARN_ON_ONCE(data->cpu != smp_processor_id());

        /*
         * Grab the skb we're about to send
         */
        skb = rcu_dereference_protected(data->skb, 1);

        /*
         * Replace it with a new one
         */
        reset_per_cpu_data(data);

        /*
         * Ship it!
         */
        if (skb)
                genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);

        put_cpu_var(dm_cpu_data);
}

/*
 * This is the timer function to delay the sending of an alert
 * in the event that more drops will arrive during the
 * hysteresis period.  Note that it runs from the timer interrupt,
 * so we don't need to disable preemption here.
 */
static void sched_send_work(unsigned long unused)
{
        struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

        schedule_work_on(smp_processor_id(), &data->dm_alert_work);

        put_cpu_var(dm_cpu_data);
}

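/*
 * Record one drop: bump the count for an existing drop point if this
 * location has been seen before, otherwise append a new drop point to
 * the pending alert and kick off the hysteresis send timer.
 */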
static void trace_drop_common(struct sk_buff *skb, void *location)
{
        struct net_dm_alert_msg *msg;
        struct nlmsghdr *nlh;
        struct nlattr *nla;
        int i;
        struct sk_buff *dskb;
        struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

        rcu_read_lock();
        dskb = rcu_dereference(data->skb);

        if (!dskb)
                goto out;

        if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
                /*
                 * we're already at zero, discard this hit
                 */
                goto out;
        }

        nlh = (struct nlmsghdr *)dskb->data;
        nla = genlmsg_data(nlmsg_data(nlh));
        msg = nla_data(nla);
        for (i = 0; i < msg->entries; i++) {
                if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
                        msg->points[i].count++;
                        atomic_inc(&data->dm_hit_count);
                        goto out;
                }
        }

        /*
         * We need to create a new entry
         */
        __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
        nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
        memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
        msg->points[msg->entries].count = 1;
        msg->entries++;

        if (!timer_pending(&data->send_timer)) {
                data->send_timer.expires = jiffies + dm_delay * HZ;
                add_timer_on(&data->send_timer, smp_processor_id());
        }

out:
        rcu_read_unlock();
        put_cpu_var(dm_cpu_data);
        return;
}

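/*
 * Hook for the kfree_skb tracepoint: fires on every software packet
 * drop, with location set to the address of the kfree_skb() caller.
 */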
static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
{
        trace_drop_common(skb, location);
}

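/*
 * Hook for the napi_poll tracepoint: hardware drops have no
 * tracepoint of their own, so instead sample each device's rx_dropped
 * counter, recording at most one hit per dm_hw_check_delta interval.
 */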
static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
{
        struct dm_hw_stat_delta *new_stat;

        /*
         * Don't check napi structures with no associated device
         */
        if (!napi->dev)
                return;

        rcu_read_lock();
        list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
                /*
                 * Only add a note to our monitor buffer if:
                 * 1) this is the dev we received on
                 * 2) we're past the last_rx check interval
                 * 3) our rx_dropped count has gone up
                 */
                if ((new_stat->dev == napi->dev) &&
                    (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
                    (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
                        trace_drop_common(NULL, NULL);
                        new_stat->last_drop_val = napi->dev->stats.rx_dropped;
                        new_stat->last_rx = jiffies;
                        break;
                }
        }
        rcu_read_unlock();
}

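/*
 * Attach or detach both tracepoints.  Returns -EAGAIN if we're already
 * in the requested state; on TRACE_OFF it also reaps stats entries for
 * devices that have since been unregistered.
 */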
static int set_all_monitor_traces(int state)
{
        int rc = 0;
        struct dm_hw_stat_delta *new_stat = NULL;
        struct dm_hw_stat_delta *temp;

        mutex_lock(&trace_state_mutex);

        if (state == trace_state) {
                rc = -EAGAIN;
                goto out_unlock;
        }

        switch (state) {
        case TRACE_ON:
                rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
                rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
                break;
        case TRACE_OFF:
                rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
                rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);

                tracepoint_synchronize_unregister();

                /*
                 * Clean the device list
                 */
                list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
                        if (new_stat->dev == NULL) {
                                list_del_rcu(&new_stat->list);
                                kfree_rcu(new_stat, rcu);
                        }
                }
                break;
        default:
                rc = 1;
                break;
        }

        if (!rc)
                trace_state = state;
        else
                rc = -EINPROGRESS;

out_unlock:
        mutex_unlock(&trace_state_mutex);

        return rc;
}

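/*
 * NET_DM_CMD_CONFIG is defined in the ABI but not implemented yet,
 * so the handler simply rejects it.
 */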
static int net_dm_cmd_config(struct sk_buff *skb,
                        struct genl_info *info)
{
        return -ENOTSUPP;
}

static int net_dm_cmd_trace(struct sk_buff *skb,
                        struct genl_info *info)
{
        switch (info->genlhdr->cmd) {
        case NET_DM_CMD_START:
                return set_all_monitor_traces(TRACE_ON);
        case NET_DM_CMD_STOP:
                return set_all_monitor_traces(TRACE_OFF);
        }

        return -ENOTSUPP;
}

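/*
 * Netdevice notifier: add a stats entry when a device registers, and
 * retire it when the device unregisters (immediately if tracing is
 * off, otherwise lazily when tracing is next turned off).
 */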
static int dropmon_net_event(struct notifier_block *ev_block,
                        unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;
        struct dm_hw_stat_delta *new_stat = NULL;
        struct dm_hw_stat_delta *tmp;

        switch (event) {
        case NETDEV_REGISTER:
                new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);

                if (!new_stat)
                        goto out;

                new_stat->dev = dev;
                new_stat->last_rx = jiffies;
                mutex_lock(&trace_state_mutex);
                list_add_rcu(&new_stat->list, &hw_stats_list);
                mutex_unlock(&trace_state_mutex);
                break;
        case NETDEV_UNREGISTER:
                mutex_lock(&trace_state_mutex);
                list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
                        if (new_stat->dev == dev) {
                                new_stat->dev = NULL;
                                if (trace_state == TRACE_OFF) {
                                        list_del_rcu(&new_stat->list);
                                        kfree_rcu(new_stat, rcu);
                                        break;
                                }
                        }
                }
                mutex_unlock(&trace_state_mutex);
                break;
        }
out:
        return NOTIFY_DONE;
}

static struct genl_ops dropmon_ops[] = {
        {
                .cmd = NET_DM_CMD_CONFIG,
                .doit = net_dm_cmd_config,
        },
        {
                .cmd = NET_DM_CMD_START,
                .doit = net_dm_cmd_trace,
        },
        {
                .cmd = NET_DM_CMD_STOP,
                .doit = net_dm_cmd_trace,
        },
};

static struct notifier_block dropmon_net_notifier = {
        .notifier_call = dropmon_net_event
};

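/*
 * Module init: register the generic netlink family and the netdevice
 * notifier, then set up the per-cpu alert state.  Tracing itself stays
 * off until userspace sends NET_DM_CMD_START.
 */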
static int __init init_net_drop_monitor(void)
{
        struct per_cpu_dm_data *data;
        int cpu, rc;

        printk(KERN_INFO "Initializing network drop monitor service\n");

        if (sizeof(void *) > 8) {
                printk(KERN_ERR "Unable to store program counters on this arch, drop monitor failed\n");
                return -ENOSPC;
        }

        rc = genl_register_family_with_ops(&net_drop_monitor_family,
                                           dropmon_ops,
                                           ARRAY_SIZE(dropmon_ops));
        if (rc) {
                printk(KERN_ERR "Could not create drop monitor netlink family\n");
                return rc;
        }

        rc = register_netdevice_notifier(&dropmon_net_notifier);
        if (rc < 0) {
                printk(KERN_CRIT "Failed to register netdevice notifier\n");
                goto out_unreg;
        }

        rc = 0;

        for_each_present_cpu(cpu) {
                data = &per_cpu(dm_cpu_data, cpu);
                data->cpu = cpu;
                INIT_WORK(&data->dm_alert_work, send_dm_alert);
                init_timer(&data->send_timer);
                data->send_timer.data = cpu;
                data->send_timer.function = sched_send_work;
                reset_per_cpu_data(data);
        }

        goto out;

out_unreg:
        genl_unregister_family(&net_drop_monitor_family);
out:
        return rc;
}

late_initcall(init_net_drop_monitor);