linux/net/core/sysctl_net_core.c
// SPDX-License-Identifier: GPL-2.0
/* -*- linux-c -*-
 * sysctl_net_core.c: sysctl interface to net core subsystem.
 *
 * Begun April 1, 1996, Mike Shaver.
 * Added /proc/sys/net/core directory entry (empty =) ). [MS]
 */

#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kmemleak.h>

#include <net/ip.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
#include <net/busy_poll.h>
#include <net/pkt_sched.h>

static int zero = 0;
static int one = 1;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;

static int net_msg_warn;        /* Unused, but still a sysctl */

#ifdef CONFIG_RPS
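/*
 * Handler for /proc/sys/net/core/rps_sock_flow_entries (e.g.
 * "echo 32768 > /proc/sys/net/core/rps_sock_flow_entries").  The
 * requested entry count is rounded up to a power of two, a new RFS
 * socket flow table is vmalloc'ed and published with
 * rcu_assign_pointer(), and the old table is freed only after
 * synchronize_rcu().  The rps_needed/rfs_needed static keys are held
 * while a table is installed.  All of this is serialised by
 * sock_flow_mutex.
 */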
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned int orig_size, size;
        int ret, i;
        struct ctl_table tmp = {
                .data = &size,
                .maxlen = sizeof(size),
                .mode = table->mode
        };
        struct rps_sock_flow_table *orig_sock_table, *sock_table;
        static DEFINE_MUTEX(sock_flow_mutex);

        mutex_lock(&sock_flow_mutex);

        orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
                                        lockdep_is_held(&sock_flow_mutex));
        size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;

        ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);

        if (write) {
                if (size) {
                        if (size > 1<<29) {
                                /* Enforce limit to prevent overflow */
                                mutex_unlock(&sock_flow_mutex);
                                return -EINVAL;
                        }
                        size = roundup_pow_of_two(size);
                        if (size != orig_size) {
                                sock_table =
                                    vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
                                if (!sock_table) {
                                        mutex_unlock(&sock_flow_mutex);
                                        return -ENOMEM;
                                }
                                rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
                                sock_table->mask = size - 1;
                        } else
                                sock_table = orig_sock_table;

                        for (i = 0; i < size; i++)
                                sock_table->ents[i] = RPS_NO_CPU;
                } else
                        sock_table = NULL;

                if (sock_table != orig_sock_table) {
                        rcu_assign_pointer(rps_sock_flow_table, sock_table);
                        if (sock_table) {
                                static_key_slow_inc(&rps_needed);
                                static_key_slow_inc(&rfs_needed);
                        }
                        if (orig_sock_table) {
                                static_key_slow_dec(&rps_needed);
                                static_key_slow_dec(&rfs_needed);
                                synchronize_rcu();
                                vfree(orig_sock_table);
                        }
                }
        }

        mutex_unlock(&sock_flow_mutex);

        return ret;
}
#endif /* CONFIG_RPS */

#ifdef CONFIG_NET_FLOW_LIMIT
static DEFINE_MUTEX(flow_limit_update_mutex);

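/*
 * Handler for /proc/sys/net/core/flow_limit_cpu_bitmap.  On write, the
 * cpumask parsed from userspace decides which CPUs carry a per-cpu
 * sd_flow_limit table: tables on newly cleared CPUs are dropped after an
 * RCU grace period, and tables are kzalloc'ed on the local node for
 * newly set CPUs.  On read, the set of CPUs that currently have a table
 * is printed back as a bitmap.
 */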
static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos)
{
        struct sd_flow_limit *cur;
        struct softnet_data *sd;
        cpumask_var_t mask;
        int i, len, ret = 0;

        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        if (write) {
                ret = cpumask_parse_user(buffer, *lenp, mask);
                if (ret)
                        goto done;

                mutex_lock(&flow_limit_update_mutex);
                len = sizeof(*cur) + netdev_flow_limit_table_len;
                for_each_possible_cpu(i) {
                        sd = &per_cpu(softnet_data, i);
                        cur = rcu_dereference_protected(sd->flow_limit,
                                     lockdep_is_held(&flow_limit_update_mutex));
                        if (cur && !cpumask_test_cpu(i, mask)) {
                                RCU_INIT_POINTER(sd->flow_limit, NULL);
                                synchronize_rcu();
                                kfree(cur);
                        } else if (!cur && cpumask_test_cpu(i, mask)) {
                                cur = kzalloc_node(len, GFP_KERNEL,
                                                   cpu_to_node(i));
                                if (!cur) {
                                        /* not unwinding previous changes */
                                        ret = -ENOMEM;
                                        goto write_unlock;
                                }
                                cur->num_buckets = netdev_flow_limit_table_len;
                                rcu_assign_pointer(sd->flow_limit, cur);
                        }
                }
write_unlock:
                mutex_unlock(&flow_limit_update_mutex);
        } else {
                char kbuf[128];

                if (*ppos || !*lenp) {
                        *lenp = 0;
                        goto done;
                }

                cpumask_clear(mask);
                rcu_read_lock();
                for_each_possible_cpu(i) {
                        sd = &per_cpu(softnet_data, i);
                        if (rcu_dereference(sd->flow_limit))
                                cpumask_set_cpu(i, mask);
                }
                rcu_read_unlock();

                len = min(sizeof(kbuf) - 1, *lenp);
                len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
                if (!len) {
                        *lenp = 0;
                        goto done;
                }
                if (len < *lenp)
                        kbuf[len++] = '\n';
                if (copy_to_user(buffer, kbuf, len)) {
                        ret = -EFAULT;
                        goto done;
                }
                *lenp = len;
                *ppos += len;
        }

done:
        free_cpumask_var(mask);
        return ret;
}

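/*
 * Handler for /proc/sys/net/core/flow_limit_table_len.  Only
 * power-of-two lengths are accepted; anything else restores the old
 * value and returns -EINVAL.  Serialised against the bitmap handler
 * above by flow_limit_update_mutex.
 */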
static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
                                       void __user *buffer, size_t *lenp,
                                       loff_t *ppos)
{
        unsigned int old, *ptr;
        int ret;

        mutex_lock(&flow_limit_update_mutex);

        ptr = table->data;
        old = *ptr;
        ret = proc_dointvec(table, write, buffer, lenp, ppos);
        if (!ret && write && !is_power_of_2(*ptr)) {
                *ptr = old;
                ret = -EINVAL;
        }

        mutex_unlock(&flow_limit_update_mutex);
        return ret;
}
#endif /* CONFIG_NET_FLOW_LIMIT */

#ifdef CONFIG_NET_SCHED
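/*
 * Handler for /proc/sys/net/core/default_qdisc.  Reads report the name
 * of the current default queueing discipline; on a successful write the
 * new name is handed to qdisc_set_default().
 */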
static int set_default_qdisc(struct ctl_table *table, int write,
                             void __user *buffer, size_t *lenp, loff_t *ppos)
{
        char id[IFNAMSIZ];
        struct ctl_table tbl = {
                .data = id,
                .maxlen = IFNAMSIZ,
        };
        int ret;

        qdisc_get_default(id, IFNAMSIZ);

        ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
        if (write && ret == 0)
                ret = qdisc_set_default(id);
        return ret;
}
#endif

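/*
 * Shared handler for dev_weight, dev_weight_rx_bias and
 * dev_weight_tx_bias.  Whenever one of them is written, the derived
 * dev_rx_weight and dev_tx_weight values are recomputed as weight_p
 * scaled by the corresponding bias.
 */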
static int proc_do_dev_weight(struct ctl_table *table, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
{
        int ret;

        ret = proc_dointvec(table, write, buffer, lenp, ppos);
        if (ret != 0)
                return ret;

        dev_rx_weight = weight_p * dev_weight_rx_bias;
        dev_tx_weight = weight_p * dev_weight_tx_bias;

        return ret;
}

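/*
 * Read-only (mode 0444) handler for /proc/sys/net/core/netdev_rss_key.
 * The boot-time RSS key is formatted as a colon-separated hex string
 * ("%*phC") into a stack buffer and returned through a temporary
 * ctl_table so that proc_dostring() does the copy-out.
 */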
static int proc_do_rss_key(struct ctl_table *table, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
{
        struct ctl_table fake_table;
        char buf[NETDEV_RSS_KEY_LEN * 3];

        snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
        fake_table.data = buf;
        fake_table.maxlen = sizeof(buf);
        return proc_dostring(&fake_table, write, buffer, lenp, ppos);
}

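/*
 * Global sysctls exposed under /proc/sys/net/core/.  Registered once for
 * the initial network namespace in sysctl_core_init(); per-namespace
 * entries live in netns_core_table below.
 */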
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
        {
                .procname       = "wmem_max",
                .data           = &sysctl_wmem_max,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_sndbuf,
        },
        {
                .procname       = "rmem_max",
                .data           = &sysctl_rmem_max,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_rcvbuf,
        },
        {
                .procname       = "wmem_default",
                .data           = &sysctl_wmem_default,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_sndbuf,
        },
        {
                .procname       = "rmem_default",
                .data           = &sysctl_rmem_default,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_rcvbuf,
        },
        {
                .procname       = "dev_weight",
                .data           = &weight_p,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_do_dev_weight,
        },
        {
                .procname       = "dev_weight_rx_bias",
                .data           = &dev_weight_rx_bias,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_do_dev_weight,
        },
        {
                .procname       = "dev_weight_tx_bias",
                .data           = &dev_weight_tx_bias,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_do_dev_weight,
        },
        {
                .procname       = "netdev_max_backlog",
                .data           = &netdev_max_backlog,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
        {
                .procname       = "netdev_rss_key",
                .data           = &netdev_rss_key,
                .maxlen         = sizeof(int),
                .mode           = 0444,
                .proc_handler   = proc_do_rss_key,
        },
#ifdef CONFIG_BPF_JIT
        {
                .procname       = "bpf_jit_enable",
                .data           = &bpf_jit_enable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
# ifdef CONFIG_HAVE_EBPF_JIT
        {
                .procname       = "bpf_jit_harden",
                .data           = &bpf_jit_harden,
                .maxlen         = sizeof(int),
                .mode           = 0600,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "bpf_jit_kallsyms",
                .data           = &bpf_jit_kallsyms,
                .maxlen         = sizeof(int),
                .mode           = 0600,
                .proc_handler   = proc_dointvec,
        },
# endif
#endif
        {
                .procname       = "netdev_tstamp_prequeue",
                .data           = &netdev_tstamp_prequeue,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
        {
                .procname       = "message_cost",
                .data           = &net_ratelimit_state.interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "message_burst",
                .data           = &net_ratelimit_state.burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "optmem_max",
                .data           = &sysctl_optmem_max,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
        {
                .procname       = "tstamp_allow_data",
                .data           = &sysctl_tstamp_allow_data,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &zero,
                .extra2         = &one
        },
#ifdef CONFIG_RPS
        {
                .procname       = "rps_sock_flow_entries",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = rps_sock_flow_sysctl
        },
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
        {
                .procname       = "flow_limit_cpu_bitmap",
                .mode           = 0644,
                .proc_handler   = flow_limit_cpu_sysctl
        },
        {
                .procname       = "flow_limit_table_len",
                .data           = &netdev_flow_limit_table_len,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = flow_limit_table_len_sysctl
        },
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_RX_BUSY_POLL
        {
                .procname       = "busy_poll",
                .data           = &sysctl_net_busy_poll,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &zero,
        },
        {
                .procname       = "busy_read",
                .data           = &sysctl_net_busy_read,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &zero,
        },
#endif
#ifdef CONFIG_NET_SCHED
        {
                .procname       = "default_qdisc",
                .mode           = 0644,
                .maxlen         = IFNAMSIZ,
                .proc_handler   = set_default_qdisc
        },
#endif
#endif /* CONFIG_NET */
        {
                .procname       = "netdev_budget",
                .data           = &netdev_budget,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
        {
                .procname       = "warnings",
                .data           = &net_msg_warn,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
        {
                .procname       = "max_skb_frags",
                .data           = &sysctl_max_skb_frags,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &one,
                .extra2         = &max_skb_frags,
        },
        {
                .procname       = "netdev_budget_usecs",
                .data           = &netdev_budget_usecs,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &zero,
        },
        { }
};

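/* Per network namespace sysctls; currently only net.core.somaxconn. */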
static struct ctl_table netns_core_table[] = {
        {
                .procname       = "somaxconn",
                .data           = &init_net.core.sysctl_somaxconn,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .extra1         = &zero,
                .proc_handler   = proc_dointvec_minmax
        },
        { }
};

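/*
 * Register the per-namespace table.  init_net uses netns_core_table
 * directly; every other namespace gets a kmemdup()'ed copy whose data
 * pointer is redirected to its own sysctl_somaxconn, and the entry is
 * dropped (procname cleared) when the namespace is not owned by the
 * initial user namespace.
 */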
static __net_init int sysctl_core_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = netns_core_table;
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
                if (tbl == NULL)
                        goto err_dup;

                tbl[0].data = &net->core.sysctl_somaxconn;

                /* Don't export any sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns) {
                        tbl[0].procname = NULL;
                }
        }

        net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
        if (net->core.sysctl_hdr == NULL)
                goto err_reg;

        return 0;

err_reg:
        if (tbl != netns_core_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}

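/*
 * Per-namespace teardown: unregister the sysctl header and free the
 * duplicated table.  The static netns_core_table itself must never be
 * freed, hence the BUG_ON().
 */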
static __net_exit void sysctl_core_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->core.sysctl_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->core.sysctl_hdr);
        BUG_ON(tbl == netns_core_table);
        kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_core_ops = {
        .init = sysctl_core_net_init,
        .exit = sysctl_core_net_exit,
};

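/*
 * Register the global net.core table for the initial namespace and the
 * per-namespace init/exit operations above.  Runs at fs_initcall() time.
 */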
static __init int sysctl_core_init(void)
{
        register_net_sysctl(&init_net, "net/core", net_core_table);
        return register_pernet_subsys(&sysctl_core_ops);
}

fs_initcall(sysctl_core_init);