summaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile6
-rw-r--r--net/core/dev.c316
-rw-r--r--net/core/dev_addr_lists.c3
-rw-r--r--net/core/dst.c2
-rw-r--r--net/core/ethtool.c712
-rw-r--r--net/core/flow.c12
-rw-r--r--net/core/flow_dissector.c143
-rw-r--r--net/core/neighbour.c227
-rw-r--r--net/core/net-sysfs.c324
-rw-r--r--net/core/netpoll.c10
-rw-r--r--net/core/netprio_cgroup.c344
-rw-r--r--net/core/pktgen.c17
-rw-r--r--net/core/request_sock.c7
-rw-r--r--net/core/rtnetlink.c25
-rw-r--r--net/core/secure_seq.c8
-rw-r--r--net/core/skbuff.c91
-rw-r--r--net/core/sock.c203
-rw-r--r--net/core/sock_diag.c192
-rw-r--r--net/core/sysctl_net_core.c9
19 files changed, 1659 insertions, 992 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 0d357b1c4e5..674641b13ae 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,12 +3,13 @@
#
obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
- gen_stats.o gen_estimator.o net_namespace.o secure_seq.o
+ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
- neighbour.o rtnetlink.o utils.o link_watch.o filter.o
+ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
+ sock_diag.o
obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
@@ -19,3 +20,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
+obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 6ba50a1e404..f494675471a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -133,10 +133,9 @@
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
-#include <linux/if_tunnel.h>
-#include <linux/if_pppox.h>
-#include <linux/ppp_defs.h>
#include <linux/net_tstamp.h>
+#include <linux/jump_label.h>
+#include <net/flow_keys.h>
#include "net-sysfs.h"
@@ -1320,8 +1319,6 @@ EXPORT_SYMBOL(dev_close);
*/
void dev_disable_lro(struct net_device *dev)
{
- u32 flags;
-
/*
* If we're trying to disable lro on a vlan device
* use the underlying physical device instead
@@ -1329,15 +1326,9 @@ void dev_disable_lro(struct net_device *dev)
if (is_vlan_dev(dev))
dev = vlan_dev_real_dev(dev);
- if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
- flags = dev->ethtool_ops->get_flags(dev);
- else
- flags = ethtool_op_get_flags(dev);
+ dev->wanted_features &= ~NETIF_F_LRO;
+ netdev_update_features(dev);
- if (!(flags & ETH_FLAG_LRO))
- return;
-
- __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
}
@@ -1396,7 +1387,7 @@ rollback:
for_each_net(net) {
for_each_netdev(net, dev) {
if (dev == last)
- break;
+ goto outroll;
if (dev->flags & IFF_UP) {
nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
@@ -1407,6 +1398,7 @@ rollback:
}
}
+outroll:
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
}
@@ -1449,34 +1441,55 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
}
EXPORT_SYMBOL(call_netdevice_notifiers);
-/* When > 0 there are consumers of rx skb time stamps */
-static atomic_t netstamp_needed = ATOMIC_INIT(0);
+static struct jump_label_key netstamp_needed __read_mostly;
+#ifdef HAVE_JUMP_LABEL
+/* We are not allowed to call jump_label_dec() from irq context
+ * If net_disable_timestamp() is called from irq context, defer the
+ * jump_label_dec() calls.
+ */
+static atomic_t netstamp_needed_deferred;
+#endif
void net_enable_timestamp(void)
{
- atomic_inc(&netstamp_needed);
+#ifdef HAVE_JUMP_LABEL
+ int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+
+ if (deferred) {
+ while (--deferred)
+ jump_label_dec(&netstamp_needed);
+ return;
+ }
+#endif
+ WARN_ON(in_interrupt());
+ jump_label_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
- atomic_dec(&netstamp_needed);
+#ifdef HAVE_JUMP_LABEL
+ if (in_interrupt()) {
+ atomic_inc(&netstamp_needed_deferred);
+ return;
+ }
+#endif
+ jump_label_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
- if (atomic_read(&netstamp_needed))
+ skb->tstamp.tv64 = 0;
+ if (static_branch(&netstamp_needed))
__net_timestamp(skb);
- else
- skb->tstamp.tv64 = 0;
}
-static inline void net_timestamp_check(struct sk_buff *skb)
-{
- if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
- __net_timestamp(skb);
-}
+#define net_timestamp_check(COND, SKB) \
+ if (static_branch(&netstamp_needed)) { \
+ if ((COND) && !(SKB)->tstamp.tv64) \
+ __net_timestamp(SKB); \
+ } \
static int net_hwtstamp_validate(struct ifreq *ifr)
{
@@ -1923,7 +1936,8 @@ EXPORT_SYMBOL(skb_checksum_help);
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*/
-struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
+struct sk_buff *skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_type *ptype;
@@ -1953,9 +1967,9 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
dev->ethtool_ops->get_drvinfo(dev, &info);
- WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
- info.driver, dev ? dev->features : 0L,
- skb->sk ? skb->sk->sk_route_caps : 0L,
+ WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d ip_summed=%d\n",
+ info.driver, dev ? &dev->features : NULL,
+ skb->sk ? &skb->sk->sk_route_caps : NULL,
skb->len, skb->data_len, skb->ip_summed);
if (skb_header_cloned(skb) &&
@@ -2064,7 +2078,7 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)
* This function segments the given skb and stores the list of segments
* in skb->next.
*/
-static int dev_gso_segment(struct sk_buff *skb, int features)
+static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
{
struct sk_buff *segs;
@@ -2103,7 +2117,7 @@ static inline void skb_orphan_try(struct sk_buff *skb)
}
}
-static bool can_checksum_protocol(unsigned long features, __be16 protocol)
+static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
{
return ((features & NETIF_F_GEN_CSUM) ||
((features & NETIF_F_V4_CSUM) &&
@@ -2114,7 +2128,8 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
protocol == htons(ETH_P_FCOE)));
}
-static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
+static netdev_features_t harmonize_features(struct sk_buff *skb,
+ __be16 protocol, netdev_features_t features)
{
if (!can_checksum_protocol(features, protocol)) {
features &= ~NETIF_F_ALL_CSUM;
@@ -2126,10 +2141,10 @@ static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features
return features;
}
-u32 netif_skb_features(struct sk_buff *skb)
+netdev_features_t netif_skb_features(struct sk_buff *skb)
{
__be16 protocol = skb->protocol;
- u32 features = skb->dev->features;
+ netdev_features_t features = skb->dev->features;
if (protocol == htons(ETH_P_8021Q)) {
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
@@ -2175,7 +2190,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
unsigned int skb_len;
if (likely(!skb->next)) {
- u32 features;
+ netdev_features_t features;
/*
* If device doesn't need skb->dst, release it right now while
@@ -2256,7 +2271,7 @@ gso:
return rc;
}
txq_trans_update(txq);
- if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
+ if (unlikely(netif_xmit_stopped(txq) && skb->next))
return NETDEV_TX_BUSY;
} while (skb->next);
@@ -2456,6 +2471,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
return rc;
}
+#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+static void skb_update_prio(struct sk_buff *skb)
+{
+ struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
+
+ if ((!skb->priority) && (skb->sk) && map)
+ skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx];
+}
+#else
+#define skb_update_prio(skb)
+#endif
+
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
@@ -2496,6 +2523,8 @@ int dev_queue_xmit(struct sk_buff *skb)
*/
rcu_read_lock_bh();
+ skb_update_prio(skb);
+
txq = dev_pick_tx(dev, skb);
q = rcu_dereference_bh(txq->qdisc);
@@ -2530,7 +2559,7 @@ int dev_queue_xmit(struct sk_buff *skb)
HARD_TX_LOCK(dev, txq, cpu);
- if (!netif_tx_queue_stopped(txq)) {
+ if (!netif_xmit_stopped(txq)) {
__this_cpu_inc(xmit_recursion);
rc = dev_hard_start_xmit(skb, dev, txq);
__this_cpu_dec(xmit_recursion);
@@ -2591,123 +2620,28 @@ static inline void ____napi_schedule(struct softnet_data *sd,
*/
void __skb_get_rxhash(struct sk_buff *skb)
{
- int nhoff, hash = 0, poff;
- const struct ipv6hdr *ip6;
- const struct iphdr *ip;
- const struct vlan_hdr *vlan;
- u8 ip_proto;
- u32 addr1, addr2;
- u16 proto;
- union {
- u32 v32;
- u16 v16[2];
- } ports;
-
- nhoff = skb_network_offset(skb);
- proto = skb->protocol;
-
-again:
- switch (proto) {
- case __constant_htons(ETH_P_IP):
-ip:
- if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
- goto done;
-
- ip = (const struct iphdr *) (skb->data + nhoff);
- if (ip_is_fragment(ip))
- ip_proto = 0;
- else
- ip_proto = ip->protocol;
- addr1 = (__force u32) ip->saddr;
- addr2 = (__force u32) ip->daddr;
- nhoff += ip->ihl * 4;
- break;
- case __constant_htons(ETH_P_IPV6):
-ipv6:
- if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
- goto done;
-
- ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
- ip_proto = ip6->nexthdr;
- addr1 = (__force u32) ip6->saddr.s6_addr32[3];
- addr2 = (__force u32) ip6->daddr.s6_addr32[3];
- nhoff += 40;
- break;
- case __constant_htons(ETH_P_8021Q):
- if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
- goto done;
- vlan = (const struct vlan_hdr *) (skb->data + nhoff);
- proto = vlan->h_vlan_encapsulated_proto;
- nhoff += sizeof(*vlan);
- goto again;
- case __constant_htons(ETH_P_PPP_SES):
- if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
- goto done;
- proto = *((__be16 *) (skb->data + nhoff +
- sizeof(struct pppoe_hdr)));
- nhoff += PPPOE_SES_HLEN;
- switch (proto) {
- case __constant_htons(PPP_IP):
- goto ip;
- case __constant_htons(PPP_IPV6):
- goto ipv6;
- default:
- goto done;
- }
- default:
- goto done;
- }
-
- switch (ip_proto) {
- case IPPROTO_GRE:
- if (pskb_may_pull(skb, nhoff + 16)) {
- u8 *h = skb->data + nhoff;
- __be16 flags = *(__be16 *)h;
+ struct flow_keys keys;
+ u32 hash;
- /*
- * Only look inside GRE if version zero and no
- * routing
- */
- if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
- proto = *(__be16 *)(h + 2);
- nhoff += 4;
- if (flags & GRE_CSUM)
- nhoff += 4;
- if (flags & GRE_KEY)
- nhoff += 4;
- if (flags & GRE_SEQ)
- nhoff += 4;
- goto again;
- }
- }
- break;
- case IPPROTO_IPIP:
- goto again;
- default:
- break;
- }
+ if (!skb_flow_dissect(skb, &keys))
+ return;
- ports.v32 = 0;
- poff = proto_ports_offset(ip_proto);
- if (poff >= 0) {
- nhoff += poff;
- if (pskb_may_pull(skb, nhoff + 4)) {
- ports.v32 = * (__force u32 *) (skb->data + nhoff);
- if (ports.v16[1] < ports.v16[0])
- swap(ports.v16[0], ports.v16[1]);
- skb->l4_rxhash = 1;
- }
+ if (keys.ports) {
+ if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
+ swap(keys.port16[0], keys.port16[1]);
+ skb->l4_rxhash = 1;
}
/* get a consistent hash (same value on both flow directions) */
- if (addr2 < addr1)
- swap(addr1, addr2);
+ if ((__force u32)keys.dst < (__force u32)keys.src)
+ swap(keys.dst, keys.src);
- hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
+ hash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src,
+ (__force u32)keys.ports, hashrnd);
if (!hash)
hash = 1;
-done:
skb->rxhash = hash;
}
EXPORT_SYMBOL(__skb_get_rxhash);
@@ -2718,6 +2652,8 @@ EXPORT_SYMBOL(__skb_get_rxhash);
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
+struct jump_label_key rps_needed __read_mostly;
+
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
@@ -2997,12 +2933,11 @@ int netif_rx(struct sk_buff *skb)
if (netpoll_rx(skb))
return NET_RX_DROP;
- if (netdev_tstamp_prequeue)
- net_timestamp_check(skb);
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb);
#ifdef CONFIG_RPS
- {
+ if (static_branch(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
@@ -3017,14 +2952,13 @@ int netif_rx(struct sk_buff *skb)
rcu_read_unlock();
preempt_enable();
- }
-#else
+ } else
+#endif
{
unsigned int qtail;
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
put_cpu();
}
-#endif
return ret;
}
EXPORT_SYMBOL(netif_rx);
@@ -3230,8 +3164,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
int ret = NET_RX_DROP;
__be16 type;
- if (!netdev_tstamp_prequeue)
- net_timestamp_check(skb);
+ net_timestamp_check(!netdev_tstamp_prequeue, skb);
trace_netif_receive_skb(skb);
@@ -3362,14 +3295,13 @@ out:
*/
int netif_receive_skb(struct sk_buff *skb)
{
- if (netdev_tstamp_prequeue)
- net_timestamp_check(skb);
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
#ifdef CONFIG_RPS
- {
+ if (static_branch(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu, ret;
@@ -3380,16 +3312,12 @@ int netif_receive_skb(struct sk_buff *skb)
if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
- } else {
- rcu_read_unlock();
- ret = __netif_receive_skb(skb);
+ return ret;
}
-
- return ret;
+ rcu_read_unlock();
}
-#else
- return __netif_receive_skb(skb);
#endif
+ return __netif_receive_skb(skb);
}
EXPORT_SYMBOL(netif_receive_skb);
@@ -4282,6 +4210,12 @@ static int dev_seq_open(struct inode *inode, struct file *file)
sizeof(struct dev_iter_state));
}
+int dev_seq_open_ops(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
+{
+ return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state));
+}
+
static const struct file_operations dev_seq_fops = {
.owner = THIS_MODULE,
.open = dev_seq_open,
@@ -4532,7 +4466,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
static int __dev_set_promiscuity(struct net_device *dev, int inc)
{
- unsigned short old_flags = dev->flags;
+ unsigned int old_flags = dev->flags;
uid_t uid;
gid_t gid;
@@ -4589,7 +4523,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)
*/
int dev_set_promiscuity(struct net_device *dev, int inc)
{
- unsigned short old_flags = dev->flags;
+ unsigned int old_flags = dev->flags;
int err;
err = __dev_set_promiscuity(dev, inc);
@@ -4616,7 +4550,7 @@ EXPORT_SYMBOL(dev_set_promiscuity);
int dev_set_allmulti(struct net_device *dev, int inc)
{
- unsigned short old_flags = dev->flags;
+ unsigned int old_flags = dev->flags;
ASSERT_RTNL();
@@ -4719,7 +4653,7 @@ EXPORT_SYMBOL(dev_get_flags);
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
- int old_flags = dev->flags;
+ unsigned int old_flags = dev->flags;
int ret;
ASSERT_RTNL();
@@ -4802,10 +4736,10 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
* Change settings on device based state flags. The flags are
* in the userspace exported format.
*/
-int dev_change_flags(struct net_device *dev, unsigned flags)
+int dev_change_flags(struct net_device *dev, unsigned int flags)
{
- int ret, changes;
- int old_flags = dev->flags;
+ int ret;
+ unsigned int changes, old_flags = dev->flags;
ret = __dev_change_flags(dev, flags);
if (ret < 0)
@@ -5362,7 +5296,8 @@ static void rollback_registered(struct net_device *dev)
list_del(&single);
}
-static u32 netdev_fix_features(struct net_device *dev, u32 features)
+static netdev_features_t netdev_fix_features(struct net_device *dev,
+ netdev_features_t features)
{
/* Fix illegal checksum combinations */
if ((features & NETIF_F_HW_CSUM) &&
@@ -5371,12 +5306,6 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features)
features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
}
- if ((features & NETIF_F_NO_CSUM) &&
- (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- netdev_warn(dev, "mixed no checksumming and other settings.\n");
- features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
- }
-
/* Fix illegal SG+CSUM combinations. */
if ((features & NETIF_F_SG) &&
!(features & NETIF_F_ALL_CSUM)) {
@@ -5424,7 +5353,7 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features)
int __netdev_update_features(struct net_device *dev)
{
- u32 features;
+ netdev_features_t features;
int err = 0;
ASSERT_RTNL();
@@ -5440,16 +5369,16 @@ int __netdev_update_features(struct net_device *dev)
if (dev->features == features)
return 0;
- netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
- dev->features, features);
+ netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
+ &dev->features, &features);
if (dev->netdev_ops->ndo_set_features)
err = dev->netdev_ops->ndo_set_features(dev, features);
if (unlikely(err < 0)) {
netdev_err(dev,
- "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
- err, features, dev->features);
+ "set_features() failed (%d); wanted %pNF, left %pNF\n",
+ err, &features, &dev->features);
return -1;
}
@@ -5548,6 +5477,9 @@ static void netdev_init_one_queue(struct net_device *dev,
queue->xmit_lock_owner = -1;
netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
queue->dev = dev;
+#ifdef CONFIG_BQL
+ dql_init(&queue->dql, HZ);
+#endif
}
static int netif_alloc_netdev_queues(struct net_device *dev)
@@ -5633,11 +5565,12 @@ int register_netdevice(struct net_device *dev)
dev->wanted_features = dev->features & dev->hw_features;
/* Turn on no cache copy if HW is doing checksum */
- dev->hw_features |= NETIF_F_NOCACHE_COPY;
- if ((dev->features & NETIF_F_ALL_CSUM) &&
- !(dev->features & NETIF_F_NO_CSUM)) {
- dev->wanted_features |= NETIF_F_NOCACHE_COPY;
- dev->features |= NETIF_F_NOCACHE_COPY;
+ if (!(dev->flags & IFF_LOOPBACK)) {
+ dev->hw_features |= NETIF_F_NOCACHE_COPY;
+ if (dev->features & NETIF_F_ALL_CSUM) {
+ dev->wanted_features |= NETIF_F_NOCACHE_COPY;
+ dev->features |= NETIF_F_NOCACHE_COPY;
+ }
}
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
@@ -6373,7 +6306,8 @@ static int dev_cpu_callback(struct notifier_block *nfb,
* @one to the master device with current feature set @all. Will not
* enable anything that is off in @mask. Returns the new feature set.
*/
-u32 netdev_increment_features(u32 all, u32 one, u32 mask)
+netdev_features_t netdev_increment_features(netdev_features_t all,
+ netdev_features_t one, netdev_features_t mask)
{
if (mask & NETIF_F_GEN_CSUM)
mask |= NETIF_F_ALL_CSUM;
@@ -6382,10 +6316,6 @@ u32 netdev_increment_features(u32 all, u32 one, u32 mask)
all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
all &= one | ~NETIF_F_ALL_FOR_ALL;
- /* If device needs checksumming, downgrade to it. */
- if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
- all &= ~NETIF_F_NO_CSUM;
-
/* If one device supports hw checksumming, set for all. */
if (all & NETIF_F_GEN_CSUM)
all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 277faef9148..febba516db6 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -696,8 +696,7 @@ static const struct seq_operations dev_mc_seq_ops = {
static int dev_mc_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_net(inode, file, &dev_mc_seq_ops,
- sizeof(struct seq_net_private));
+ return dev_seq_open_ops(inode, file, &dev_mc_seq_ops);
}
static const struct file_operations dev_mc_seq_fops = {
diff --git a/net/core/dst.c b/net/core/dst.c
index d5e2c4c0910..43d94cedbf7 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -366,7 +366,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
dev_hold(dst->dev);
dev_put(dev);
rcu_read_lock();
- neigh = dst_get_neighbour(dst);
+ neigh = dst_get_neighbour_noref(dst);
if (neigh && neigh->dev == dev) {
neigh->dev = dst->dev;
dev_hold(dst->dev);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f4448170712..921aa2b4b41 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -36,235 +36,44 @@ u32 ethtool_op_get_link(struct net_device *dev)
}
EXPORT_SYMBOL(ethtool_op_get_link);
-u32 ethtool_op_get_tx_csum(struct net_device *dev)
-{
- return (dev->features & NETIF_F_ALL_CSUM) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_tx_csum);
-
-int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_IP_CSUM;
- else
- dev->features &= ~NETIF_F_IP_CSUM;
-
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tx_csum);
-
-int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_HW_CSUM;
- else
- dev->features &= ~NETIF_F_HW_CSUM;
-
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
-
-int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
- else
- dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum);
-
-u32 ethtool_op_get_sg(struct net_device *dev)
-{
- return (dev->features & NETIF_F_SG) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_sg);
-
-int ethtool_op_set_sg(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_SG;
- else
- dev->features &= ~NETIF_F_SG;
-
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_sg);
-
-u32 ethtool_op_get_tso(struct net_device *dev)
-{
- return (dev->features & NETIF_F_TSO) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_tso);
-
-int ethtool_op_set_tso(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_TSO;
- else
- dev->features &= ~NETIF_F_TSO;
-
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tso);
-
-u32 ethtool_op_get_ufo(struct net_device *dev)
-{
- return (dev->features & NETIF_F_UFO) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_ufo);
-
-int ethtool_op_set_ufo(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_UFO;
- else
- dev->features &= ~NETIF_F_UFO;
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_ufo);
-
-/* the following list of flags are the same as their associated
- * NETIF_F_xxx values in include/linux/netdevice.h
- */
-static const u32 flags_dup_features =
- (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
- ETH_FLAG_RXHASH);
-
-u32 ethtool_op_get_flags(struct net_device *dev)
-{
- /* in the future, this function will probably contain additional
- * handling for flags which are not so easily handled
- * by a simple masking operation
- */
-
- return dev->features & flags_dup_features;
-}
-EXPORT_SYMBOL(ethtool_op_get_flags);
-
-/* Check if device can enable (or disable) particular feature coded in "data"
- * argument. Flags "supported" describe features that can be toggled by device.
- * If feature can not be toggled, it state (enabled or disabled) must match
- * hardcoded device features state, otherwise flags are marked as invalid.
- */
-bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported)
-{
- u32 features = dev->features & flags_dup_features;
- /* "data" can contain only flags_dup_features bits,
- * see __ethtool_set_flags */
-
- return (features & ~supported) != (data & ~supported);
-}
-EXPORT_SYMBOL(ethtool_invalid_flags);
-
-int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
-{
- if (ethtool_invalid_flags(dev, data, supported))
- return -EINVAL;
-
- dev->features = ((dev->features & ~flags_dup_features) |
- (data & flags_dup_features));
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_flags);
-
/* Handlers for each ethtool command */
-#define ETHTOOL_DEV_FEATURE_WORDS 1
-
-static void ethtool_get_features_compat(struct net_device *dev,
- struct ethtool_get_features_block *features)
-{
- if (!dev->ethtool_ops)
- return;
-
- /* getting RX checksum */
- if (dev->ethtool_ops->get_rx_csum)
- if (dev->ethtool_ops->get_rx_csum(dev))
- features[0].active |= NETIF_F_RXCSUM;
-
- /* mark legacy-changeable features */
- if (dev->ethtool_ops->set_sg)
- features[0].available |= NETIF_F_SG;
- if (dev->ethtool_ops->set_tx_csum)
- features[0].available |= NETIF_F_ALL_CSUM;
- if (dev->ethtool_ops->set_tso)
- features[0].available |= NETIF_F_ALL_TSO;
- if (dev->ethtool_ops->set_rx_csum)
- features[0].available |= NETIF_F_RXCSUM;
- if (dev->ethtool_ops->set_flags)
- features[0].available |= flags_dup_features;
-}
-
-static int ethtool_set_feature_compat(struct net_device *dev,
- int (*legacy_set)(struct net_device *, u32),
- struct ethtool_set_features_block *features, u32 mask)
-{
- u32 do_set;
-
- if (!legacy_set)
- return 0;
-
- if (!(features[0].valid & mask))
- return 0;
-
- features[0].valid &= ~mask;
-
- do_set = !!(features[0].requested & mask);
-
- if (legacy_set(dev, do_set) < 0)
- netdev_info(dev,
- "Legacy feature change (%s) failed for 0x%08x\n",
- do_set ? "set" : "clear", mask);
-
- return 1;
-}
-
-static int ethtool_set_flags_compat(struct net_device *dev,
- int (*legacy_set)(struct net_device *, u32),
- struct ethtool_set_features_block *features, u32 mask)
-{
- u32 value;
-
- if (!legacy_set)
- return 0;
-
- if (!(features[0].valid & mask))
- return 0;
-
- value = dev->features & ~features[0].valid;
- value |= features[0].requested;
-
- features[0].valid &= ~mask;
-
- if (legacy_set(dev, value & mask) < 0)
- netdev_info(dev, "Legacy flags change failed\n");
-
- return 1;
-}
-
-static int ethtool_set_features_compat(struct net_device *dev,
- struct ethtool_set_features_block *features)
-{
- int compat;
-
- if (!dev->ethtool_ops)
- return 0;
-
- compat = ethtool_set_feature_compat(dev, dev->ethtool_ops->set_sg,
- features, NETIF_F_SG);
- compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tx_csum,
- features, NETIF_F_ALL_CSUM);
- compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tso,
- features, NETIF_F_ALL_TSO);
- compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_rx_csum,
- features, NETIF_F_RXCSUM);
- compat |= ethtool_set_flags_compat(dev, dev->ethtool_ops->set_flags,
- features, flags_dup_features);
-
- return compat;
-}
+#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32)
+
+static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
+ [NETIF_F_SG_BIT] = "tx-scatter-gather",
+ [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
+ [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
+ [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
+ [NETIF_F_HIGHDMA_BIT] = "highdma",
+ [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
+ [NETIF_F_HW_VLAN_TX_BIT] = "tx-vlan-hw-insert",
+
+ [NETIF_F_HW_VLAN_RX_BIT] = "rx-vlan-hw-parse",
+ [NETIF_F_HW_VLAN_FILTER_BIT] = "rx-vlan-filter",
+ [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
+ [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
+ [NETIF_F_LLTX_BIT] = "tx-lockless",
+ [NETIF_F_NETNS_LOCAL_BIT] = "netns-local",
+ [NETIF_F_GRO_BIT] = "rx-gro",
+ [NETIF_F_LRO_BIT] = "rx-lro",
+
+ [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
+ [NETIF_F_UFO_BIT] = "tx-udp-fragmentation",
+ [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
+ [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
+ [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
+
+ [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
+ [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
+ [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
+ [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
+ [NETIF_F_RXHASH_BIT] = "rx-hashing",
+ [NETIF_F_RXCSUM_BIT] = "rx-checksum",
+ [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
+ [NETIF_F_LOOPBACK_BIT] = "loopback",
+};
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
@@ -272,18 +81,21 @@ static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
.cmd = ETHTOOL_GFEATURES,
.size = ETHTOOL_DEV_FEATURE_WORDS,
};
- struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = {
- {
- .available = dev->hw_features,
- .requested = dev->wanted_features,
- .active = dev->features,
- .never_changed = NETIF_F_NEVER_CHANGE,
- },
- };
+ struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
u32 __user *sizeaddr;
u32 copy_size;
+ int i;
- ethtool_get_features_compat(dev, features);
+ /* in case feature bits run out again */
+ BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t));
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ features[i].available = (u32)(dev->hw_features >> (32 * i));
+ features[i].requested = (u32)(dev->wanted_features >> (32 * i));
+ features[i].active = (u32)(dev->features >> (32 * i));
+ features[i].never_changed =
+ (u32)(NETIF_F_NEVER_CHANGE >> (32 * i));
+ }
sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
if (get_user(copy_size, sizeaddr))
@@ -305,7 +117,8 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_sfeatures cmd;
struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
- int ret = 0;
+ netdev_features_t wanted = 0, valid = 0;
+ int i, ret = 0;
if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
return -EFAULT;
@@ -317,65 +130,29 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
if (copy_from_user(features, useraddr, sizeof(features)))
return -EFAULT;
- if (features[0].valid & ~NETIF_F_ETHTOOL_BITS)
- return -EINVAL;
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ valid |= (netdev_features_t)features[i].valid << (32 * i);
+ wanted |= (netdev_features_t)features[i].requested << (32 * i);
+ }
- if (ethtool_set_features_compat(dev, features))
- ret |= ETHTOOL_F_COMPAT;
+ if (valid & ~NETIF_F_ETHTOOL_BITS)
+ return -EINVAL;
- if (features[0].valid & ~dev->hw_features) {
- features[0].valid &= dev->hw_features;
+ if (valid & ~dev->hw_features) {
+ valid &= dev->hw_features;
ret |= ETHTOOL_F_UNSUPPORTED;
}
- dev->wanted_features &= ~features[0].valid;
- dev->wanted_features |= features[0].valid & features[0].requested;
+ dev->wanted_features &= ~valid;
+ dev->wanted_features |= wanted & valid;
__netdev_update_features(dev);
- if ((dev->wanted_features ^ dev->features) & features[0].valid)
+ if ((dev->wanted_features ^ dev->features) & valid)
ret |= ETHTOOL_F_WISH;
return ret;
}
-static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = {
- /* NETIF_F_SG */ "tx-scatter-gather",
- /* NETIF_F_IP_CSUM */ "tx-checksum-ipv4",
- /* NETIF_F_NO_CSUM */ "tx-checksum-unneeded",
- /* NETIF_F_HW_CSUM */ "tx-checksum-ip-generic",
- /* NETIF_F_IPV6_CSUM */ "tx-checksum-ipv6",
- /* NETIF_F_HIGHDMA */ "highdma",
- /* NETIF_F_FRAGLIST */ "tx-scatter-gather-fraglist",
- /* NETIF_F_HW_VLAN_TX */ "tx-vlan-hw-insert",
-
- /* NETIF_F_HW_VLAN_RX */ "rx-vlan-hw-parse",
- /* NETIF_F_HW_VLAN_FILTER */ "rx-vlan-filter",
- /* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged",
- /* NETIF_F_GSO */ "tx-generic-segmentation",
- /* NETIF_F_LLTX */ "tx-lockless",
- /* NETIF_F_NETNS_LOCAL */ "netns-local",
- /* NETIF_F_GRO */ "rx-gro",
- /* NETIF_F_LRO */ "rx-lro",
-
- /* NETIF_F_TSO */ "tx-tcp-segmentation",
- /* NETIF_F_UFO */ "tx-udp-fragmentation",
- /* NETIF_F_GSO_ROBUST */ "tx-gso-robust",
- /* NETIF_F_TSO_ECN */ "tx-tcp-ecn-segmentation",
- /* NETIF_F_TSO6 */ "tx-tcp6-segmentation",
- /* NETIF_F_FSO */ "tx-fcoe-segmentation",
- "",
- "",
-
- /* NETIF_F_FCOE_CRC */ "tx-checksum-fcoe-crc",
- /* NETIF_F_SCTP_CSUM */ "tx-checksum-sctp",
- /* NETIF_F_FCOE_MTU */ "fcoe-mtu",
- /* NETIF_F_NTUPLE */ "rx-ntuple-filter",
- /* NETIF_F_RXHASH */ "rx-hashing",
- /* NETIF_F_RXCSUM */ "rx-checksum",
- /* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy",
- /* NETIF_F_LOOPBACK */ "loopback",
-};
-
static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -402,7 +179,7 @@ static void __ethtool_get_strings(struct net_device *dev,
ops->get_strings(dev, stringset, data);
}
-static u32 ethtool_get_feature_mask(u32 eth_cmd)
+static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
{
/* feature masks of legacy discrete ethtool ops */
@@ -433,136 +210,82 @@ static u32 ethtool_get_feature_mask(u32 eth_cmd)
}
}
-static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd)
-{
- const struct ethtool_ops *ops = dev->ethtool_ops;
-
- if (!ops)
- return NULL;
-
- switch (ethcmd) {
- case ETHTOOL_GTXCSUM:
- return ops->get_tx_csum;
- case ETHTOOL_GRXCSUM:
- return ops->get_rx_csum;
- case ETHTOOL_SSG:
- return ops->get_sg;
- case ETHTOOL_STSO:
- return ops->get_tso;
- case ETHTOOL_SUFO:
- return ops->get_ufo;
- default:
- return NULL;
- }
-}
-
-static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev)
-{
- return !!(dev->features & NETIF_F_ALL_CSUM);
-}
-
static int ethtool_get_one_feature(struct net_device *dev,
char __user *useraddr, u32 ethcmd)
{
- u32 mask = ethtool_get_feature_mask(ethcmd);
+ netdev_features_t mask = ethtool_get_feature_mask(ethcmd);
struct ethtool_value edata = {
.cmd = ethcmd,
.data = !!(dev->features & mask),
};
- /* compatibility with discrete get_ ops */
- if (!(dev->hw_features & mask)) {
- u32 (*actor)(struct net_device *);
-
- actor = __ethtool_get_one_feature_actor(dev, ethcmd);
-
- /* bug compatibility with old get_rx_csum */
- if (ethcmd == ETHTOOL_GRXCSUM && !actor)
- actor = __ethtool_get_rx_csum_oldbug;
-
- if (actor)
- edata.data = actor(dev);
- }
-
if (copy_to_user(useraddr, &edata, sizeof(edata)))
return -EFAULT;
return 0;
}
-static int __ethtool_set_tx_csum(struct net_device *dev, u32 data);
-static int __ethtool_set_rx_csum(struct net_device *dev, u32 data);
-static int __ethtool_set_sg(struct net_device *dev, u32 data);
-static int __ethtool_set_tso(struct net_device *dev, u32 data);
-static int __ethtool_set_ufo(struct net_device *dev, u32 data);
-
static int ethtool_set_one_feature(struct net_device *dev,
void __user *useraddr, u32 ethcmd)
{
struct ethtool_value edata;
- u32 mask;
+ netdev_features_t mask;
if (copy_from_user(&edata, useraddr, sizeof(edata)))
return -EFAULT;
mask = ethtool_get_feature_mask(ethcmd);
mask &= dev->hw_features;
- if (mask) {
- if (edata.data)
- dev->wanted_features |= mask;
- else
- dev->wanted_features &= ~mask;
+ if (!mask)
+ return -EOPNOTSUPP;
- __netdev_update_features(dev);
- return 0;
- }
+ if (edata.data)
+ dev->wanted_features |= mask;
+ else
+ dev->wanted_features &= ~mask;
- /* Driver is not converted to ndo_fix_features or does not
- * support changing this offload. In the latter case it won't
- * have corresponding ethtool_ops field set.
- *
- * Following part is to be removed after all drivers advertise
- * their changeable features in netdev->hw_features and stop
- * using discrete offload setting ops.
- */
+ __netdev_update_features(dev);
- switch (ethcmd) {
- case ETHTOOL_STXCSUM:
- return __ethtool_set_tx_csum(dev, edata.data);
- case ETHTOOL_SRXCSUM:
- return __ethtool_set_rx_csum(dev, edata.data);
- case ETHTOOL_SSG:
- return __ethtool_set_sg(dev, edata.data);
- case ETHTOOL_STSO:
- return __ethtool_set_tso(dev, edata.data);
- case ETHTOOL_SUFO:
- return __ethtool_set_ufo(dev, edata.data);
- default:
- return -EOPNOTSUPP;
- }
+ return 0;
+}
+
+#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \
+ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH)
+#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_RX | \
+ NETIF_F_HW_VLAN_TX | NETIF_F_NTUPLE | NETIF_F_RXHASH)
+
+static u32 __ethtool_get_flags(struct net_device *dev)
+{
+ u32 flags = 0;
+
+ if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO;
+ if (dev->features & NETIF_F_HW_VLAN_RX) flags |= ETH_FLAG_RXVLAN;
+ if (dev->features & NETIF_F_HW_VLAN_TX) flags |= ETH_FLAG_TXVLAN;
+ if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE;
+ if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH;
+
+ return flags;
}
-int __ethtool_set_flags(struct net_device *dev, u32 data)
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
{
- u32 changed;
+ netdev_features_t features = 0, changed;
- if (data & ~flags_dup_features)
+ if (data & ~ETH_ALL_FLAGS)
return -EINVAL;
- /* legacy set_flags() op */
- if (dev->ethtool_ops->set_flags) {
- if (unlikely(dev->hw_features & flags_dup_features))
- netdev_warn(dev,
- "driver BUG: mixed hw_features and set_flags()\n");
- return dev->ethtool_ops->set_flags(dev, data);
- }
+ if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO;
+ if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_RX;
+ if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_TX;
+ if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE;
+ if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH;
/* allow changing only bits set in hw_features */
- changed = (data ^ dev->features) & flags_dup_features;
+ changed = (features ^ dev->features) & ETH_ALL_FEATURES;
if (changed & ~dev->hw_features)
return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
dev->wanted_features =
- (dev->wanted_features & ~changed) | (data & dev->hw_features);
+ (dev->wanted_features & ~changed) | (features & changed);
__netdev_update_features(dev);
@@ -716,6 +439,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
{
struct ethtool_rxnfc info;
size_t info_size = sizeof(info);
+ int rc;
if (!dev->ethtool_ops->set_rxnfc)
return -EOPNOTSUPP;
@@ -731,7 +455,15 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
if (copy_from_user(&info, useraddr, info_size))
return -EFAULT;
- return dev->ethtool_ops->set_rxnfc(dev, &info);
+ rc = dev->ethtool_ops->set_rxnfc(dev, &info);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS &&
+ copy_to_user(useraddr, &info, info_size))
+ return -EFAULT;
+
+ return 0;
}
static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
@@ -792,34 +524,44 @@ err_out:
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_rxfh_indir *indir;
- u32 table_size;
- size_t full_size;
+ u32 user_size, dev_size;
+ u32 *indir;
int ret;
- if (!dev->ethtool_ops->get_rxfh_indir)
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh_indir)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
return -EOPNOTSUPP;
- if (copy_from_user(&table_size,
+ if (copy_from_user(&user_size,
useraddr + offsetof(struct ethtool_rxfh_indir, size),
- sizeof(table_size)))
+ sizeof(user_size)))
return -EFAULT;
- if (table_size >
- (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
- return -ENOMEM;
- full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
- indir = kzalloc(full_size, GFP_USER);
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ &dev_size, sizeof(dev_size)))
+ return -EFAULT;
+
+ /* If the user buffer size is 0, this is just a query for the
+ * device table size. Otherwise, if it's smaller than the
+ * device table size it's an error.
+ */
+ if (user_size < dev_size)
+ return user_size == 0 ? 0 : -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
if (!indir)
return -ENOMEM;
- indir->cmd = ETHTOOL_GRXFHINDIR;
- indir->size = table_size;
ret = dev->ethtool_ops->get_rxfh_indir(dev, indir);
if (ret)
goto out;
- if (copy_to_user(useraddr, indir, full_size))
+ if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh_indir, ring_index[0]),
+ indir, dev_size * sizeof(indir[0])))
ret = -EFAULT;
out:
@@ -830,30 +572,56 @@ out:
static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_rxfh_indir *indir;
- u32 table_size;
- size_t full_size;
+ struct ethtool_rxnfc rx_rings;
+ u32 user_size, dev_size, i;
+ u32 *indir;
int ret;
- if (!dev->ethtool_ops->set_rxfh_indir)
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->set_rxfh_indir ||
+ !dev->ethtool_ops->get_rxnfc)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
return -EOPNOTSUPP;
- if (copy_from_user(&table_size,
+ if (copy_from_user(&user_size,
useraddr + offsetof(struct ethtool_rxfh_indir, size),
- sizeof(table_size)))
+ sizeof(user_size)))
return -EFAULT;
- if (table_size >
- (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
- return -ENOMEM;
- full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
- indir = kmalloc(full_size, GFP_USER);
+ if (user_size != 0 && user_size != dev_size)
+ return -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
if (!indir)
return -ENOMEM;
- if (copy_from_user(indir, useraddr, full_size)) {
- ret = -EFAULT;
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = dev->ethtool_ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
goto out;
+
+ if (user_size == 0) {
+ for (i = 0; i < dev_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ } else {
+ if (copy_from_user(indir,
+ useraddr +
+ offsetof(struct ethtool_rxfh_indir,
+ ring_index[0]),
+ dev_size * sizeof(indir[0]))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* Validate ring indices */
+ for (i = 0; i < dev_size; i++) {
+ if (indir[i] >= rx_rings.data) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
}
ret = dev->ethtool_ops->set_rxfh_indir(dev, indir);
@@ -863,58 +631,6 @@ out:
return ret;
}
-/*
- * ethtool does not (or did not) set masks for flow parameters that are
- * not specified, so if both value and mask are 0 then this must be
- * treated as equivalent to a mask with all bits set. Implement that
- * here rather than in drivers.
- */
-static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs)
-{
- struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec;
- struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec;
-
- if (fs->flow_type != TCP_V4_FLOW &&
- fs->flow_type != UDP_V4_FLOW &&
- fs->flow_type != SCTP_V4_FLOW)
- return;
-
- if (!(entry->ip4src | mask->ip4src))
- mask->ip4src = htonl(0xffffffff);
- if (!(entry->ip4dst | mask->ip4dst))
- mask->ip4dst = htonl(0xffffffff);
- if (!(entry->psrc | mask->psrc))
- mask->psrc = htons(0xffff);
- if (!(entry->pdst | mask->pdst))
- mask->pdst = htons(0xffff);
- if (!(entry->tos | mask->tos))
- mask->tos = 0xff;
- if (!(fs->vlan_tag | fs->vlan_tag_mask))
- fs->vlan_tag_mask = 0xffff;
- if (!(fs->data | fs->data_mask))
- fs->data_mask = 0xffffffffffffffffULL;
-}
-
-static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
- void __user *useraddr)
-{
- struct ethtool_rx_ntuple cmd;
- const struct ethtool_ops *ops = dev->ethtool_ops;
-
- if (!ops->set_rx_ntuple)
- return -EOPNOTSUPP;
-
- if (!(dev->features & NETIF_F_NTUPLE))
- return -EINVAL;
-
- if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
- return -EFAULT;
-
- rx_ntuple_fix_masks(&cmd.fs);
-
- return ops->set_rx_ntuple(dev, &cmd);
-}
-
static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
{
struct ethtool_regs regs;
@@ -1231,81 +947,6 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
}
-static int __ethtool_set_sg(struct net_device *dev, u32 data)
-{
- int err;
-
- if (!dev->ethtool_ops->set_sg)
- return -EOPNOTSUPP;
-
- if (data && !(dev->features & NETIF_F_ALL_CSUM))
- return -EINVAL;
-
- if (!data && dev->ethtool_ops->set_tso) {
- err = dev->ethtool_ops->set_tso(dev, 0);
- if (err)
- return err;
- }
-
- if (!data && dev->ethtool_ops->set_ufo) {
- err = dev->ethtool_ops->set_ufo(dev, 0);
- if (err)
- return err;
- }
- return dev->ethtool_ops->set_sg(dev, data);
-}
-
-static int __ethtool_set_tx_csum(struct net_device *dev, u32 data)
-{
- int err;
-
- if (!dev->ethtool_ops->set_tx_csum)
- return -EOPNOTSUPP;
-
- if (!data && dev->ethtool_ops->set_sg) {
- err = __ethtool_set_sg(dev, 0);
- if (err)
- return err;
- }
-
- return dev->ethtool_ops->set_tx_csum(dev, data);
-}
-
-static int __ethtool_set_rx_csum(struct net_device *dev, u32 data)
-{
- if (!dev->ethtool_ops->set_rx_csum)
- return -EOPNOTSUPP;
-
- if (!data)
- dev->features &= ~NETIF_F_GRO;
-
- return dev->ethtool_ops->set_rx_csum(dev, data);
-}
-
-static int __ethtool_set_tso(struct net_device *dev, u32 data)
-{
- if (!dev->ethtool_ops->set_tso)
- return -EOPNOTSUPP;
-
- if (data && !(dev->features & NETIF_F_SG))
- return -EINVAL;
-
- return dev->ethtool_ops->set_tso(dev, data);
-}
-
-static int __ethtool_set_ufo(struct net_device *dev, u32 data)
-{
- if (!dev->ethtool_ops->set_ufo)
- return -EOPNOTSUPP;
- if (data && !(dev->features & NETIF_F_SG))
- return -EINVAL;
- if (data && !((dev->features & NETIF_F_GEN_CSUM) ||
- (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
- == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)))
- return -EINVAL;
- return dev->ethtool_ops->set_ufo(dev, data);
-}
-
static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
{
struct ethtool_test test;
@@ -1771,9 +1412,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
break;
case ETHTOOL_GFLAGS:
rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_flags ?
- dev->ethtool_ops->get_flags :
- ethtool_op_get_flags));
+ __ethtool_get_flags);
break;
case ETHTOOL_SFLAGS:
rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
@@ -1804,9 +1443,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_RESET:
rc = ethtool_reset(dev, useraddr);
break;
- case ETHTOOL_SRXNTUPLE:
- rc = ethtool_set_rx_ntuple(dev, useraddr);
- break;
case ETHTOOL_GSSET_INFO:
rc = ethtool_get_sset_info(dev, useraddr);
break;
diff --git a/net/core/flow.c b/net/core/flow.c
index 8ae42de9c79..e318c7e9804 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -358,6 +358,18 @@ void flow_cache_flush(void)
put_online_cpus();
}
+static void flow_cache_flush_task(struct work_struct *work)
+{
+ flow_cache_flush();
+}
+
+static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);
+
+void flow_cache_flush_deferred(void)
+{
+ schedule_work(&flow_cache_flush_work);
+}
+
static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
new file mode 100644
index 00000000000..0985b9b14b8
--- /dev/null
+++ b/net/core/flow_dissector.c
@@ -0,0 +1,143 @@
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <linux/if_tunnel.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <net/flow_keys.h>
+
+/* copy saddr & daddr, possibly using 64bit load/store
+ * Equivalent to : flow->src = iph->saddr;
+ * flow->dst = iph->daddr;
+ */
+static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
+{
+ BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
+ offsetof(typeof(*flow), src) + sizeof(flow->src));
+ memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
+}
+
+bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
+{
+ int poff, nhoff = skb_network_offset(skb);
+ u8 ip_proto;
+ __be16 proto = skb->protocol;
+
+ memset(flow, 0, sizeof(*flow));
+
+again:
+ switch (proto) {
+ case __constant_htons(ETH_P_IP): {
+ const struct iphdr *iph;
+ struct iphdr _iph;
+ip:
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return false;
+
+ if (ip_is_fragment(iph))
+ ip_proto = 0;
+ else
+ ip_proto = iph->protocol;
+ iph_to_flow_copy_addrs(flow, iph);
+ nhoff += iph->ihl * 4;
+ break;
+ }
+ case __constant_htons(ETH_P_IPV6): {
+ const struct ipv6hdr *iph;
+ struct ipv6hdr _iph;
+ipv6:
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return false;
+
+ ip_proto = iph->nexthdr;
+ flow->src = iph->saddr.s6_addr32[3];
+ flow->dst = iph->daddr.s6_addr32[3];
+ nhoff += sizeof(struct ipv6hdr);
+ break;
+ }
+ case __constant_htons(ETH_P_8021Q): {
+ const struct vlan_hdr *vlan;
+ struct vlan_hdr _vlan;
+
+ vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan);
+ if (!vlan)
+ return false;
+
+ proto = vlan->h_vlan_encapsulated_proto;
+ nhoff += sizeof(*vlan);
+ goto again;
+ }
+ case __constant_htons(ETH_P_PPP_SES): {
+ struct {
+ struct pppoe_hdr hdr;
+ __be16 proto;
+ } *hdr, _hdr;
+ hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+ proto = hdr->proto;
+ nhoff += PPPOE_SES_HLEN;
+ switch (proto) {
+ case __constant_htons(PPP_IP):
+ goto ip;
+ case __constant_htons(PPP_IPV6):
+ goto ipv6;
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+
+ switch (ip_proto) {
+ case IPPROTO_GRE: {
+ struct gre_hdr {
+ __be16 flags;
+ __be16 proto;
+ } *hdr, _hdr;
+
+ hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+ /*
+ * Only look inside GRE if version zero and no
+ * routing
+ */
+ if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
+ proto = hdr->proto;
+ nhoff += 4;
+ if (hdr->flags & GRE_CSUM)
+ nhoff += 4;
+ if (hdr->flags & GRE_KEY)
+ nhoff += 4;
+ if (hdr->flags & GRE_SEQ)
+ nhoff += 4;
+ goto again;
+ }
+ break;
+ }
+ case IPPROTO_IPIP:
+ goto again;
+ default:
+ break;
+ }
+
+ flow->ip_proto = ip_proto;
+ poff = proto_ports_offset(ip_proto);
+ if (poff >= 0) {
+ __be32 *ports, _ports;
+
+ nhoff += poff;
+ ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports);
+ if (ports)
+ flow->ports = *ports;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(skb_flow_dissect);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 039d51e6c28..e287346e093 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -238,6 +238,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
it to safe state.
*/
skb_queue_purge(&n->arp_queue);
+ n->arp_queue_len_bytes = 0;
n->output = neigh_blackhole;
if (n->nud_state & NUD_VALID)
n->nud_state = NUD_NOARP;
@@ -272,7 +273,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
}
EXPORT_SYMBOL(neigh_ifdown);
-static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
@@ -287,7 +288,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
goto out_entries;
}
- n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC);
+ if (tbl->entry_size)
+ n = kzalloc(tbl->entry_size, GFP_ATOMIC);
+ else {
+ int sz = sizeof(*n) + tbl->key_len;
+
+ sz = ALIGN(sz, NEIGH_PRIV_ALIGN);
+ sz += dev->neigh_priv_len;
+ n = kzalloc(sz, GFP_ATOMIC);
+ }
if (!n)
goto out_entries;
@@ -313,11 +322,18 @@ out_entries:
goto out;
}
+static void neigh_get_hash_rnd(u32 *x)
+{
+ get_random_bytes(x, sizeof(*x));
+ *x |= 1;
+}
+
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
size_t size = (1 << shift) * sizeof(struct neighbour *);
struct neigh_hash_table *ret;
struct neighbour __rcu **buckets;
+ int i;
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
@@ -334,8 +350,8 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
}
ret->hash_buckets = buckets;
ret->hash_shift = shift;
- get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
- ret->hash_rnd |= 1;
+ for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
+ neigh_get_hash_rnd(&ret->hash_rnd[i]);
return ret;
}
@@ -462,7 +478,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
u32 hash_val;
int key_len = tbl->key_len;
int error;
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+ struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
struct neigh_hash_table *nht;
if (!n) {
@@ -480,6 +496,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_neigh_release;
}
+ if (dev->netdev_ops->ndo_neigh_construct) {
+ error = dev->netdev_ops->ndo_neigh_construct(n);
+ if (error < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+ }
+
/* Device specific setup. */
if (n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
@@ -677,18 +701,14 @@ static inline void neigh_parms_put(struct neigh_parms *parms)
neigh_parms_destroy(parms);
}
-static void neigh_destroy_rcu(struct rcu_head *head)
-{
- struct neighbour *neigh = container_of(head, struct neighbour, rcu);
-
- kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
-}
/*
* neighbour must already be out of the table;
*
*/
void neigh_destroy(struct neighbour *neigh)
{
+ struct net_device *dev = neigh->dev;
+
NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
if (!neigh->dead) {
@@ -702,14 +722,18 @@ void neigh_destroy(struct neighbour *neigh)
printk(KERN_WARNING "Impossible event.\n");
skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
- dev_put(neigh->dev);
+ if (dev->netdev_ops->ndo_neigh_destroy)
+ dev->netdev_ops->ndo_neigh_destroy(neigh);
+
+ dev_put(dev);
neigh_parms_put(neigh->parms);
NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
atomic_dec(&neigh->tbl->entries);
- call_rcu(&neigh->rcu, neigh_destroy_rcu);
+ kfree_rcu(neigh, rcu);
}
EXPORT_SYMBOL(neigh_destroy);
@@ -842,6 +866,7 @@ static void neigh_invalidate(struct neighbour *neigh)
write_lock(&neigh->lock);
}
skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
}
static void neigh_probe(struct neighbour *neigh)
@@ -980,15 +1005,20 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
- if (skb_queue_len(&neigh->arp_queue) >=
- neigh->parms->queue_len) {
+ while (neigh->arp_queue_len_bytes + skb->truesize >
+ neigh->parms->queue_len_bytes) {
struct sk_buff *buff;
+
buff = __skb_dequeue(&neigh->arp_queue);
+ if (!buff)
+ break;
+ neigh->arp_queue_len_bytes -= buff->truesize;
kfree_skb(buff);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
__skb_queue_tail(&neigh->arp_queue, skb);
+ neigh->arp_queue_len_bytes += skb->truesize;
}
rc = 1;
}
@@ -1167,7 +1197,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
rcu_read_lock();
/* On shaper/eql skb->dst->neighbour != neigh :( */
- if (dst && (n2 = dst_get_neighbour(dst)) != NULL)
+ if (dst && (n2 = dst_get_neighbour_noref(dst)) != NULL)
n1 = n2;
n1->output(n1, skb);
rcu_read_unlock();
@@ -1175,6 +1205,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
write_lock_bh(&neigh->lock);
}
skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
}
out:
if (update_isrouter) {
@@ -1477,11 +1508,6 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
tbl->parms.reachable_time =
neigh_rand_reach_time(tbl->parms.base_reachable_time);
- if (!tbl->kmem_cachep)
- tbl->kmem_cachep =
- kmem_cache_create(tbl->id, tbl->entry_size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL);
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
panic("cannot create neighbour cache statistics");
@@ -1566,9 +1592,6 @@ int neigh_table_clear(struct neigh_table *tbl)
free_percpu(tbl->stats);
tbl->stats = NULL;
- kmem_cache_destroy(tbl->kmem_cachep);
- tbl->kmem_cachep = NULL;
-
return 0;
}
EXPORT_SYMBOL(neigh_table_clear);
@@ -1747,7 +1770,11 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
- NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
+ NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes);
+ /* approximative value for deprecated QUEUE_LEN (in packets) */
+ NLA_PUT_U32(skb, NDTPA_QUEUE_LEN,
+ DIV_ROUND_UP(parms->queue_len_bytes,
+ SKB_TRUESIZE(ETH_FRAME_LEN)));
NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
@@ -1808,7 +1835,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
- ndc.ndtc_hash_rnd = nht->hash_rnd;
+ ndc.ndtc_hash_rnd = nht->hash_rnd[0];
ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
rcu_read_unlock_bh();
@@ -1974,7 +2001,11 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
switch (i) {
case NDTPA_QUEUE_LEN:
- p->queue_len = nla_get_u32(tbp[i]);
+ p->queue_len_bytes = nla_get_u32(tbp[i]) *
+ SKB_TRUESIZE(ETH_FRAME_LEN);
+ break;
+ case NDTPA_QUEUE_LENBYTES:
+ p->queue_len_bytes = nla_get_u32(tbp[i]);
break;
case NDTPA_PROXY_QLEN:
p->proxy_qlen = nla_get_u32(tbp[i]);
@@ -2397,7 +2428,10 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
- pn = pn->next;
+ do {
+ pn = pn->next;
+ } while (pn && !net_eq(pneigh_net(pn), net));
+
while (!pn) {
if (++state->bucket > PNEIGH_HASHMASK)
break;
@@ -2635,117 +2669,158 @@ EXPORT_SYMBOL(neigh_app_ns);
#ifdef CONFIG_SYSCTL
-#define NEIGH_VARS_MAX 19
+static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int size, ret;
+ ctl_table tmp = *ctl;
+
+ tmp.data = &size;
+ size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN));
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret)
+ *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
+ return ret;
+}
+
+enum {
+ NEIGH_VAR_MCAST_PROBE,
+ NEIGH_VAR_UCAST_PROBE,
+ NEIGH_VAR_APP_PROBE,
+ NEIGH_VAR_RETRANS_TIME,
+ NEIGH_VAR_BASE_REACHABLE_TIME,
+ NEIGH_VAR_DELAY_PROBE_TIME,
+ NEIGH_VAR_GC_STALETIME,
+ NEIGH_VAR_QUEUE_LEN,
+ NEIGH_VAR_QUEUE_LEN_BYTES,
+ NEIGH_VAR_PROXY_QLEN,
+ NEIGH_VAR_ANYCAST_DELAY,
+ NEIGH_VAR_PROXY_DELAY,
+ NEIGH_VAR_LOCKTIME,
+ NEIGH_VAR_RETRANS_TIME_MS,
+ NEIGH_VAR_BASE_REACHABLE_TIME_MS,
+ NEIGH_VAR_GC_INTERVAL,
+ NEIGH_VAR_GC_THRESH1,
+ NEIGH_VAR_GC_THRESH2,
+ NEIGH_VAR_GC_THRESH3,
+ NEIGH_VAR_MAX
+};
static struct neigh_sysctl_table {
struct ctl_table_header *sysctl_header;
- struct ctl_table neigh_vars[NEIGH_VARS_MAX];
+ struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
char *dev_name;
} neigh_sysctl_template __read_mostly = {
.neigh_vars = {
- {
+ [NEIGH_VAR_MCAST_PROBE] = {
.procname = "mcast_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_UCAST_PROBE] = {
.procname = "ucast_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_APP_PROBE] = {
.procname = "app_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_RETRANS_TIME] = {
.procname = "retrans_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
- {
+ [NEIGH_VAR_BASE_REACHABLE_TIME] = {
.procname = "base_reachable_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
+ [NEIGH_VAR_DELAY_PROBE_TIME] = {
.procname = "delay_first_probe_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
+ [NEIGH_VAR_GC_STALETIME] = {
.procname = "gc_stale_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
+ [NEIGH_VAR_QUEUE_LEN] = {
.procname = "unres_qlen",
.maxlen = sizeof(int),
.mode = 0644,
+ .proc_handler = proc_unres_qlen,
+ },
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = {
+ .procname = "unres_qlen_bytes",
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_PROXY_QLEN] = {
.procname = "proxy_qlen",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_ANYCAST_DELAY] = {
.procname = "anycast_delay",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
- {
+ [NEIGH_VAR_PROXY_DELAY] = {
.procname = "proxy_delay",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
- {
+ [NEIGH_VAR_LOCKTIME] = {
.procname = "locktime",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
- {
+ [NEIGH_VAR_RETRANS_TIME_MS] = {
.procname = "retrans_time_ms",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
- {
+ [NEIGH_VAR_BASE_REACHABLE_TIME_MS] = {
.procname = "base_reachable_time_ms",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
- {
+ [NEIGH_VAR_GC_INTERVAL] = {
.procname = "gc_interval",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
+ [NEIGH_VAR_GC_THRESH1] = {
.procname = "gc_thresh1",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_GC_THRESH2] = {
.procname = "gc_thresh2",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_GC_THRESH3] = {
.procname = "gc_thresh3",
.maxlen = sizeof(int),
.mode = 0644,
@@ -2778,47 +2853,49 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
if (!t)
goto err;
- t->neigh_vars[0].data = &p->mcast_probes;
- t->neigh_vars[1].data = &p->ucast_probes;
- t->neigh_vars[2].data = &p->app_probes;
- t->neigh_vars[3].data = &p->retrans_time;
- t->neigh_vars[4].data = &p->base_reachable_time;
- t->neigh_vars[5].data = &p->delay_probe_time;
- t->neigh_vars[6].data = &p->gc_staletime;
- t->neigh_vars[7].data = &p->queue_len;
- t->neigh_vars[8].data = &p->proxy_qlen;
- t->neigh_vars[9].data = &p->anycast_delay;
- t->neigh_vars[10].data = &p->proxy_delay;
- t->neigh_vars[11].data = &p->locktime;
- t->neigh_vars[12].data = &p->retrans_time;
- t->neigh_vars[13].data = &p->base_reachable_time;
+ t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data = &p->mcast_probes;
+ t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data = &p->ucast_probes;
+ t->neigh_vars[NEIGH_VAR_APP_PROBE].data = &p->app_probes;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data = &p->retrans_time;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data = &p->base_reachable_time;
+ t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data = &p->delay_probe_time;
+ t->neigh_vars[NEIGH_VAR_GC_STALETIME].data = &p->gc_staletime;
+ t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data = &p->queue_len_bytes;
+ t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data = &p->queue_len_bytes;
+ t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data = &p->proxy_qlen;
+ t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data = &p->anycast_delay;
+ t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay;
+ t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data = &p->retrans_time;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data = &p->base_reachable_time;
if (dev) {
dev_name_source = dev->name;
/* Terminate the table early */
- memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14]));
+ memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
+ sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
} else {
dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname;
- t->neigh_vars[14].data = (int *)(p + 1);
- t->neigh_vars[15].data = (int *)(p + 1) + 1;
- t->neigh_vars[16].data = (int *)(p + 1) + 2;
- t->neigh_vars[17].data = (int *)(p + 1) + 3;
+ t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1);
+ t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3;
}
if (handler) {
/* RetransTime */
- t->neigh_vars[3].proc_handler = handler;
- t->neigh_vars[3].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev;
/* ReachableTime */
- t->neigh_vars[4].proc_handler = handler;
- t->neigh_vars[4].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev;
/* RetransTime (in milliseconds)*/
- t->neigh_vars[12].proc_handler = handler;
- t->neigh_vars[12].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev;
/* ReachableTime (in milliseconds) */
- t->neigh_vars[13].proc_handler = handler;
- t->neigh_vars[13].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;
}
t->dev_name = kstrdup(dev_name_source, GFP_KERNEL);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c71c434a4c0..abf4393a77b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -21,6 +21,7 @@
#include <linux/wireless.h>
#include <linux/vmalloc.h>
#include <linux/export.h>
+#include <linux/jiffies.h>
#include <net/wext.h>
#include "net-sysfs.h"
@@ -606,9 +607,12 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
rcu_assign_pointer(queue->rps_map, map);
spin_unlock(&rps_map_lock);
- if (old_map)
+ if (map)
+ jump_label_inc(&rps_needed);
+ if (old_map) {
kfree_rcu(old_map, rcu);
-
+ jump_label_dec(&rps_needed);
+ }
free_cpumask_var(mask);
return len;
}
@@ -618,15 +622,15 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
char *buf)
{
struct rps_dev_flow_table *flow_table;
- unsigned int val = 0;
+ unsigned long val = 0;
rcu_read_lock();
flow_table = rcu_dereference(queue->rps_flow_table);
if (flow_table)
- val = flow_table->mask + 1;
+ val = (unsigned long)flow_table->mask + 1;
rcu_read_unlock();
- return sprintf(buf, "%u\n", val);
+ return sprintf(buf, "%lu\n", val);
}
static void rps_dev_flow_table_release_work(struct work_struct *work)
@@ -650,33 +654,46 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
struct rx_queue_attribute *attr,
const char *buf, size_t len)
{
- unsigned int count;
- char *endp;
+ unsigned long mask, count;
struct rps_dev_flow_table *table, *old_table;
static DEFINE_SPINLOCK(rps_dev_flow_lock);
+ int rc;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- count = simple_strtoul(buf, &endp, 0);
- if (endp == buf)
- return -EINVAL;
+ rc = kstrtoul(buf, 0, &count);
+ if (rc < 0)
+ return rc;
if (count) {
- int i;
-
- if (count > 1<<30) {
+ mask = count - 1;
+ /* mask = roundup_pow_of_two(count) - 1;
+ * without overflows...
+ */
+ while ((mask | (mask >> 1)) != mask)
+ mask |= (mask >> 1);
+ /* On 64 bit arches, must check mask fits in table->mask (u32),
+ * and on 32bit arches, must check RPS_DEV_FLOW_TABLE_SIZE(mask + 1)
+ * doesnt overflow.
+ */
+#if BITS_PER_LONG > 32
+ if (mask > (unsigned long)(u32)mask)
+ return -EINVAL;
+#else
+ if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
+ / sizeof(struct rps_dev_flow)) {
/* Enforce a limit to prevent overflow */
return -EINVAL;
}
- count = roundup_pow_of_two(count);
- table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
+#endif
+ table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
if (!table)
return -ENOMEM;
- table->mask = count - 1;
- for (i = 0; i < count; i++)
- table->flows[i].cpu = RPS_NO_CPU;
+ table->mask = mask;
+ for (count = 0; count <= mask; count++)
+ table->flows[count].cpu = RPS_NO_CPU;
} else
table = NULL;
@@ -780,7 +797,7 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
#endif
}
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SYSFS
/*
* netdev_queue sysfs structures and functions.
*/
@@ -826,6 +843,133 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
.store = netdev_queue_attr_store,
};
+static ssize_t show_trans_timeout(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ unsigned long trans_timeout;
+
+ spin_lock_irq(&queue->_xmit_lock);
+ trans_timeout = queue->trans_timeout;
+ spin_unlock_irq(&queue->_xmit_lock);
+
+ return sprintf(buf, "%lu", trans_timeout);
+}
+
+static struct netdev_queue_attribute queue_trans_timeout =
+ __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+
+#ifdef CONFIG_BQL
+/*
+ * Byte queue limits sysfs structures and functions.
+ */
+static ssize_t bql_show(char *buf, unsigned int value)
+{
+ return sprintf(buf, "%u\n", value);
+}
+
+static ssize_t bql_set(const char *buf, const size_t count,
+ unsigned int *pvalue)
+{
+ unsigned int value;
+ int err;
+
+ if (!strcmp(buf, "max") || !strcmp(buf, "max\n"))
+ value = DQL_MAX_LIMIT;
+ else {
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+ if (value > DQL_MAX_LIMIT)
+ return -EINVAL;
+ }
+
+ *pvalue = value;
+
+ return count;
+}
+
+static ssize_t bql_show_hold_time(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
+}
+
+static ssize_t bql_set_hold_time(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct dql *dql = &queue->dql;
+ unsigned value;
+ int err;
+
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+
+ dql->slack_hold_time = msecs_to_jiffies(value);
+
+ return len;
+}
+
+static struct netdev_queue_attribute bql_hold_time_attribute =
+ __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time,
+ bql_set_hold_time);
+
+static ssize_t bql_show_inflight(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
+}
+
+static struct netdev_queue_attribute bql_inflight_attribute =
+ __ATTR(inflight, S_IRUGO | S_IWUSR, bql_show_inflight, NULL);
+
+#define BQL_ATTR(NAME, FIELD) \
+static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
+ struct netdev_queue_attribute *attr, \
+ char *buf) \
+{ \
+ return bql_show(buf, queue->dql.FIELD); \
+} \
+ \
+static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
+ struct netdev_queue_attribute *attr, \
+ const char *buf, size_t len) \
+{ \
+ return bql_set(buf, len, &queue->dql.FIELD); \
+} \
+ \
+static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \
+ __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \
+ bql_set_ ## NAME);
+
+BQL_ATTR(limit, limit)
+BQL_ATTR(limit_max, max_limit)
+BQL_ATTR(limit_min, min_limit)
+
+static struct attribute *dql_attrs[] = {
+ &bql_limit_attribute.attr,
+ &bql_limit_max_attribute.attr,
+ &bql_limit_min_attribute.attr,
+ &bql_hold_time_attribute.attr,
+ &bql_inflight_attribute.attr,
+ NULL
+};
+
+static struct attribute_group dql_group = {
+ .name = "byte_queue_limits",
+ .attrs = dql_attrs,
+};
+#endif /* CONFIG_BQL */
+
+#ifdef CONFIG_XPS
static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
struct net_device *dev = queue->dev;
@@ -890,6 +1034,52 @@ static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
+static void xps_queue_release(struct netdev_queue *queue)
+{
+ struct net_device *dev = queue->dev;
+ struct xps_dev_maps *dev_maps;
+ struct xps_map *map;
+ unsigned long index;
+ int i, pos, nonempty = 0;
+
+ index = get_netdev_queue_index(queue);
+
+ mutex_lock(&xps_map_mutex);
+ dev_maps = xmap_dereference(dev->xps_maps);
+
+ if (dev_maps) {
+ for_each_possible_cpu(i) {
+ map = xmap_dereference(dev_maps->cpu_map[i]);
+ if (!map)
+ continue;
+
+ for (pos = 0; pos < map->len; pos++)
+ if (map->queues[pos] == index)
+ break;
+
+ if (pos < map->len) {
+ if (map->len > 1)
+ map->queues[pos] =
+ map->queues[--map->len];
+ else {
+ RCU_INIT_POINTER(dev_maps->cpu_map[i],
+ NULL);
+ kfree_rcu(map, rcu);
+ map = NULL;
+ }
+ }
+ if (map)
+ nonempty = 1;
+ }
+
+ if (!nonempty) {
+ RCU_INIT_POINTER(dev->xps_maps, NULL);
+ kfree_rcu(dev_maps, rcu);
+ }
+ }
+ mutex_unlock(&xps_map_mutex);
+}
+
static ssize_t store_xps_map(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute,
const char *buf, size_t len)
@@ -901,7 +1091,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
struct xps_map *map, *new_map;
struct xps_dev_maps *dev_maps, *new_dev_maps;
int nonempty = 0;
- int numa_node = -2;
+ int numa_node_id = -2;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -944,10 +1134,10 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu);
#ifdef CONFIG_NUMA
if (need_set) {
- if (numa_node == -2)
- numa_node = cpu_to_node(cpu);
- else if (numa_node != cpu_to_node(cpu))
- numa_node = -1;
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(cpu);
+ else if (numa_node_id != cpu_to_node(cpu))
+ numa_node_id = -1;
}
#endif
if (need_set && pos >= map_len) {
@@ -997,7 +1187,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
if (dev_maps)
kfree_rcu(dev_maps, rcu);
- netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node :
+ netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? numa_node_id :
NUMA_NO_NODE);
mutex_unlock(&xps_map_mutex);
@@ -1020,58 +1210,23 @@ error:
static struct netdev_queue_attribute xps_cpus_attribute =
__ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+#endif /* CONFIG_XPS */
static struct attribute *netdev_queue_default_attrs[] = {
+ &queue_trans_timeout.attr,
+#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
+#endif
NULL
};
static void netdev_queue_release(struct kobject *kobj)
{
struct netdev_queue *queue = to_netdev_queue(kobj);
- struct net_device *dev = queue->dev;
- struct xps_dev_maps *dev_maps;
- struct xps_map *map;
- unsigned long index;
- int i, pos, nonempty = 0;
-
- index = get_netdev_queue_index(queue);
-
- mutex_lock(&xps_map_mutex);
- dev_maps = xmap_dereference(dev->xps_maps);
-
- if (dev_maps) {
- for_each_possible_cpu(i) {
- map = xmap_dereference(dev_maps->cpu_map[i]);
- if (!map)
- continue;
- for (pos = 0; pos < map->len; pos++)
- if (map->queues[pos] == index)
- break;
-
- if (pos < map->len) {
- if (map->len > 1)
- map->queues[pos] =
- map->queues[--map->len];
- else {
- RCU_INIT_POINTER(dev_maps->cpu_map[i],
- NULL);
- kfree_rcu(map, rcu);
- map = NULL;
- }
- }
- if (map)
- nonempty = 1;
- }
-
- if (!nonempty) {
- RCU_INIT_POINTER(dev->xps_maps, NULL);
- kfree_rcu(dev_maps, rcu);
- }
- }
-
- mutex_unlock(&xps_map_mutex);
+#ifdef CONFIG_XPS
+ xps_queue_release(queue);
+#endif
memset(kobj, 0, sizeof(*kobj));
dev_put(queue->dev);
@@ -1092,22 +1247,29 @@ static int netdev_queue_add_kobject(struct net_device *net, int index)
kobj->kset = net->queues_kset;
error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
"tx-%u", index);
- if (error) {
- kobject_put(kobj);
- return error;
- }
+ if (error)
+ goto exit;
+
+#ifdef CONFIG_BQL
+ error = sysfs_create_group(kobj, &dql_group);
+ if (error)
+ goto exit;
+#endif
kobject_uevent(kobj, KOBJ_ADD);
dev_hold(queue->dev);
+ return 0;
+exit:
+ kobject_put(kobj);
return error;
}
-#endif /* CONFIG_XPS */
+#endif /* CONFIG_SYSFS */
int
netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
{
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SYSFS
int i;
int error = 0;
@@ -1119,20 +1281,26 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}
}
- while (--i >= new_num)
- kobject_put(&net->_tx[i].kobj);
+ while (--i >= new_num) {
+ struct netdev_queue *queue = net->_tx + i;
+
+#ifdef CONFIG_BQL
+ sysfs_remove_group(&queue->kobj, &dql_group);
+#endif
+ kobject_put(&queue->kobj);
+ }
return error;
#else
return 0;
-#endif
+#endif /* CONFIG_SYSFS */
}
static int register_queue_kobjects(struct net_device *net)
{
int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
net->queues_kset = kset_create_and_add("queues",
NULL, &net->dev.kobj);
if (!net->queues_kset)
@@ -1173,7 +1341,7 @@ static void remove_queue_kobjects(struct net_device *net)
net_rx_queue_update_kobjects(net, real_rx, 0);
netdev_queue_update_kobjects(net, real_tx, 0);
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
kset_unregister(net->queues_kset);
#endif
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index cf64c1ffa4c..0d38808a230 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -76,7 +76,7 @@ static void queue_process(struct work_struct *work)
local_irq_save(flags);
__netif_tx_lock(txq, smp_processor_id());
- if (netif_tx_queue_frozen_or_stopped(txq) ||
+ if (netif_xmit_frozen_or_stopped(txq) ||
ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
skb_queue_head(&npinfo->txq, skb);
__netif_tx_unlock(txq);
@@ -317,7 +317,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
tries > 0; --tries) {
if (__netif_tx_trylock(txq)) {
- if (!netif_tx_queue_stopped(txq)) {
+ if (!netif_xmit_stopped(txq)) {
status = ops->ndo_start_xmit(skb, dev);
if (status == NETDEV_TX_OK)
txq_trans_update(txq);
@@ -422,6 +422,7 @@ static void arp_reply(struct sk_buff *skb)
struct sk_buff *send_skb;
struct netpoll *np, *tmp;
unsigned long flags;
+ int hlen, tlen;
int hits = 0;
if (list_empty(&npinfo->rx_np))
@@ -479,8 +480,9 @@ static void arp_reply(struct sk_buff *skb)
if (tip != np->local_ip)
continue;
- send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev),
- LL_RESERVED_SPACE(np->dev));
+ hlen = LL_RESERVED_SPACE(np->dev);
+ tlen = np->dev->needed_tailroom;
+ send_skb = find_skb(np, size + hlen + tlen, hlen);
if (!send_skb)
continue;
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
new file mode 100644
index 00000000000..3a9fd4826b7
--- /dev/null
+++ b/net/core/netprio_cgroup.c
@@ -0,0 +1,344 @@
+/*
+ * net/core/netprio_cgroup.c Priority Control Group
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+#include <net/netprio_cgroup.h>
+
+static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
+ struct cgroup *cgrp);
+static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
+
+struct cgroup_subsys net_prio_subsys = {
+ .name = "net_prio",
+ .create = cgrp_create,
+ .destroy = cgrp_destroy,
+ .populate = cgrp_populate,
+#ifdef CONFIG_NETPRIO_CGROUP
+ .subsys_id = net_prio_subsys_id,
+#endif
+ .module = THIS_MODULE
+};
+
+#define PRIOIDX_SZ 128
+
+static unsigned long prioidx_map[PRIOIDX_SZ];
+static DEFINE_SPINLOCK(prioidx_map_lock);
+static atomic_t max_prioidx = ATOMIC_INIT(0);
+
+static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id),
+ struct cgroup_netprio_state, css);
+}
+
+static int get_prioidx(u32 *prio)
+{
+ unsigned long flags;
+ u32 prioidx;
+
+ spin_lock_irqsave(&prioidx_map_lock, flags);
+ prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ);
+ set_bit(prioidx, prioidx_map);
+ spin_unlock_irqrestore(&prioidx_map_lock, flags);
+ if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ)
+ return -ENOSPC;
+
+ atomic_set(&max_prioidx, prioidx);
+ *prio = prioidx;
+ return 0;
+}
+
+static void put_prioidx(u32 idx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&prioidx_map_lock, flags);
+ clear_bit(idx, prioidx_map);
+ spin_unlock_irqrestore(&prioidx_map_lock, flags);
+}
+
+static void extend_netdev_table(struct net_device *dev, u32 new_len)
+{
+ size_t new_size = sizeof(struct netprio_map) +
+ ((sizeof(u32) * new_len));
+ struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL);
+ struct netprio_map *old_priomap;
+ int i;
+
+ old_priomap = rtnl_dereference(dev->priomap);
+
+ if (!new_priomap) {
+ printk(KERN_WARNING "Unable to alloc new priomap!\n");
+ return;
+ }
+
+ for (i = 0;
+ old_priomap && (i < old_priomap->priomap_len);
+ i++)
+ new_priomap->priomap[i] = old_priomap->priomap[i];
+
+ new_priomap->priomap_len = new_len;
+
+ rcu_assign_pointer(dev->priomap, new_priomap);
+ if (old_priomap)
+ kfree_rcu(old_priomap, rcu);
+}
+
+static void update_netdev_tables(void)
+{
+ struct net_device *dev;
+ u32 max_len = atomic_read(&max_prioidx);
+ struct netprio_map *map;
+
+ rtnl_lock();
+ for_each_netdev(&init_net, dev) {
+ map = rtnl_dereference(dev->priomap);
+ if ((!map) ||
+ (map->priomap_len < max_len))
+ extend_netdev_table(dev, max_len);
+ }
+ rtnl_unlock();
+}
+
+static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
+ struct cgroup *cgrp)
+{
+ struct cgroup_netprio_state *cs;
+ int ret;
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return ERR_PTR(-ENOMEM);
+
+ if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) {
+ kfree(cs);
+ return ERR_PTR(-EINVAL);
+ }
+
+ ret = get_prioidx(&cs->prioidx);
+ if (ret != 0) {
+ printk(KERN_WARNING "No space in priority index array\n");
+ kfree(cs);
+ return ERR_PTR(ret);
+ }
+
+ return &cs->css;
+}
+
+static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct cgroup_netprio_state *cs;
+ struct net_device *dev;
+ struct netprio_map *map;
+
+ cs = cgrp_netprio_state(cgrp);
+ rtnl_lock();
+ for_each_netdev(&init_net, dev) {
+ map = rtnl_dereference(dev->priomap);
+ if (map)
+ map->priomap[cs->prioidx] = 0;
+ }
+ rtnl_unlock();
+ put_prioidx(cs->prioidx);
+ kfree(cs);
+}
+
+static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)
+{
+ return (u64)cgrp_netprio_state(cgrp)->prioidx;
+}
+
+static int read_priomap(struct cgroup *cont, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct net_device *dev;
+ u32 prioidx = cgrp_netprio_state(cont)->prioidx;
+ u32 priority;
+ struct netprio_map *map;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ map = rcu_dereference(dev->priomap);
+ priority = map ? map->priomap[prioidx] : 0;
+ cb->fill(cb, dev->name, priority);
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
+ const char *buffer)
+{
+ char *devname = kstrdup(buffer, GFP_KERNEL);
+ int ret = -EINVAL;
+ u32 prioidx = cgrp_netprio_state(cgrp)->prioidx;
+ unsigned long priority;
+ char *priostr;
+ struct net_device *dev;
+ struct netprio_map *map;
+
+ if (!devname)
+ return -ENOMEM;
+
+ /*
+ * Minimally sized valid priomap string
+ */
+ if (strlen(devname) < 3)
+ goto out_free_devname;
+
+ priostr = strstr(devname, " ");
+ if (!priostr)
+ goto out_free_devname;
+
+ /*
+ *Separate the devname from the associated priority
+ *and advance the priostr poitner to the priority value
+ */
+ *priostr = '\0';
+ priostr++;
+
+ /*
+ * If the priostr points to NULL, we're at the end of the passed
+ * in string, and its not a valid write
+ */
+ if (*priostr == '\0')
+ goto out_free_devname;
+
+ ret = kstrtoul(priostr, 10, &priority);
+ if (ret < 0)
+ goto out_free_devname;
+
+ ret = -ENODEV;
+
+ dev = dev_get_by_name(&init_net, devname);
+ if (!dev)
+ goto out_free_devname;
+
+ update_netdev_tables();
+ ret = 0;
+ rcu_read_lock();
+ map = rcu_dereference(dev->priomap);
+ if (map)
+ map->priomap[prioidx] = priority;
+ rcu_read_unlock();
+ dev_put(dev);
+
+out_free_devname:
+ kfree(devname);
+ return ret;
+}
+
+static struct cftype ss_files[] = {
+ {
+ .name = "prioidx",
+ .read_u64 = read_prioidx,
+ },
+ {
+ .name = "ifpriomap",
+ .read_map = read_priomap,
+ .write_string = write_priomap,
+ },
+};
+
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
+}
+
+static int netprio_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct netprio_map *old;
+ u32 max_len = atomic_read(&max_prioidx);
+
+ /*
+ * Note this is called with rtnl_lock held so we have update side
+ * protection on our rcu assignments
+ */
+
+ switch (event) {
+
+ case NETDEV_REGISTER:
+ if (max_len)
+ extend_netdev_table(dev, max_len);
+ break;
+ case NETDEV_UNREGISTER:
+ old = rtnl_dereference(dev->priomap);
+ RCU_INIT_POINTER(dev->priomap, NULL);
+ if (old)
+ kfree_rcu(old, rcu);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block netprio_device_notifier = {
+ .notifier_call = netprio_device_event
+};
+
+static int __init init_cgroup_netprio(void)
+{
+ int ret;
+
+ ret = cgroup_load_subsys(&net_prio_subsys);
+ if (ret)
+ goto out;
+#ifndef CONFIG_NETPRIO_CGROUP
+ smp_wmb();
+ net_prio_subsys_id = net_prio_subsys.subsys_id;
+#endif
+
+ register_netdevice_notifier(&netprio_device_notifier);
+
+out:
+ return ret;
+}
+
+static void __exit exit_cgroup_netprio(void)
+{
+ struct netprio_map *old;
+ struct net_device *dev;
+
+ unregister_netdevice_notifier(&netprio_device_notifier);
+
+ cgroup_unload_subsys(&net_prio_subsys);
+
+#ifndef CONFIG_NETPRIO_CGROUP
+ net_prio_subsys_id = -1;
+ synchronize_rcu();
+#endif
+
+ rtnl_lock();
+ for_each_netdev(&init_net, dev) {
+ old = rtnl_dereference(dev->priomap);
+ RCU_INIT_POINTER(dev->priomap, NULL);
+ if (old)
+ kfree_rcu(old, rcu);
+ }
+ rtnl_unlock();
+}
+
+module_init(init_cgroup_netprio);
+module_exit(exit_cgroup_netprio);
+MODULE_LICENSE("GPL v2");
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 0001c243b35..449fe0f068f 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1304,7 +1304,7 @@ static ssize_t pktgen_if_write(struct file *file,
scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);
snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr);
+ pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr;
if (debug)
printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf);
@@ -1327,8 +1327,7 @@ static ssize_t pktgen_if_write(struct file *file,
scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_daddr,
- &pkt_dev->min_in6_daddr);
+ pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr;
if (debug)
printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf);
@@ -1371,7 +1370,7 @@ static ssize_t pktgen_if_write(struct file *file,
scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);
snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr);
+ pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr;
if (debug)
printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf);
@@ -2079,9 +2078,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
ifp = ifp->if_next) {
if (ifp->scope == IFA_LINK &&
!(ifp->flags & IFA_F_TENTATIVE)) {
- ipv6_addr_copy(&pkt_dev->
- cur_in6_saddr,
- &ifp->addr);
+ pkt_dev->cur_in6_saddr = ifp->addr;
err = 0;
break;
}
@@ -2958,8 +2955,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
iph->payload_len = htons(sizeof(struct udphdr) + datalen);
iph->nexthdr = IPPROTO_UDP;
- ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr);
- ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr);
+ iph->daddr = pkt_dev->cur_in6_daddr;
+ iph->saddr = pkt_dev->cur_in6_saddr;
skb->mac_header = (skb->network_header - ETH_HLEN -
pkt_dev->pkt_overhead);
@@ -3345,7 +3342,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
__netif_tx_lock_bh(txq);
- if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) {
+ if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
ret = NETDEV_TX_BUSY;
pkt_dev->last_ok = 0;
goto unlock;
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 182236b2510..9b570a6a33c 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -26,10 +26,11 @@
* but then some measure against one socket starving all other sockets
* would be needed.
*
- * It was 128 by default. Experiments with real servers show, that
+ * The minimum value of it is 128. Experiments with real servers show that
* it is absolutely not enough even at 100conn/sec. 256 cures most
- * of problems. This value is adjusted to 128 for very small machines
- * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
+ * of problems.
+ * This value is adjusted to 128 for low memory machines,
+ * and it will increase in proportion to the memory of machine.
* Note : Dont forget somaxconn that may limit backlog too.
*/
int sysctl_max_syn_backlog = 256;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9083e82bdae..dbf2ddafd52 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -273,6 +273,17 @@ EXPORT_SYMBOL_GPL(rtnl_unregister_all);
static LIST_HEAD(link_ops);
+static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+{
+ const struct rtnl_link_ops *ops;
+
+ list_for_each_entry(ops, &link_ops, list) {
+ if (!strcmp(ops->kind, kind))
+ return ops;
+ }
+ return NULL;
+}
+
/**
* __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
* @ops: struct rtnl_link_ops * to register
@@ -285,6 +296,9 @@ static LIST_HEAD(link_ops);
*/
int __rtnl_link_register(struct rtnl_link_ops *ops)
{
+ if (rtnl_link_ops_get(ops->kind))
+ return -EEXIST;
+
if (!ops->dellink)
ops->dellink = unregister_netdevice_queue;
@@ -351,17 +365,6 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops)
}
EXPORT_SYMBOL_GPL(rtnl_link_unregister);
-static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
-{
- const struct rtnl_link_ops *ops;
-
- list_for_each_entry(ops, &link_ops, list) {
- if (!strcmp(ops->kind, kind))
- return ops;
- }
- return NULL;
-}
-
static size_t rtnl_link_get_size(const struct net_device *dev)
{
const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 025233de25f..6fd44606fdd 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -19,6 +19,7 @@ static int __init net_secret_init(void)
}
late_initcall(net_secret_init);
+#ifdef CONFIG_INET
static u32 seq_scale(u32 seq)
{
/*
@@ -33,8 +34,9 @@ static u32 seq_scale(u32 seq)
*/
return seq + (ktime_to_ns(ktime_get_real()) >> 6);
}
+#endif
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
__be16 sport, __be16 dport)
{
@@ -132,7 +134,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif
-#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
+#if IS_ENABLED(CONFIG_IP_DCCP)
u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
__be16 sport, __be16 dport)
{
@@ -154,7 +156,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
}
EXPORT_SYMBOL(secure_dccp_sequence_number);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
__be16 sport, __be16 dport)
{
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 18a3cebb753..da0c97f2fab 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -245,6 +245,55 @@ nodata:
EXPORT_SYMBOL(__alloc_skb);
/**
+ * build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ *
+ * Allocate a new &sk_buff. Caller provides space holding head and
+ * skb_shared_info. @data must have been allocated by kmalloc()
+ * The return is the new skb buffer.
+ * On a failure the return is %NULL, and @data is not freed.
+ * Notes :
+ * Before IO, driver allocates only data buffer where NIC put incoming frame
+ * Driver should add room at head (NET_SKB_PAD) and
+ * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
+ * After IO, driver calls build_skb(), to allocate sk_buff and populate it
+ * before giving packet to stack.
+ * RX rings only contains data buffers, not full skbs.
+ */
+struct sk_buff *build_skb(void *data)
+{
+ struct skb_shared_info *shinfo;
+ struct sk_buff *skb;
+ unsigned int size;
+
+ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->truesize = SKB_TRUESIZE(size);
+ atomic_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->mac_header = ~0U;
+#endif
+
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+ atomic_set(&shinfo->dataref, 1);
+ kmemcheck_annotate_variable(shinfo->destructor_arg);
+
+ return skb;
+}
+EXPORT_SYMBOL(build_skb);
+
+/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
* @dev: network device to receive on
* @length: length to allocate
@@ -403,7 +452,7 @@ static void skb_release_head_state(struct sk_buff *skb)
WARN_ON(in_irq());
skb->destructor(skb);
}
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb->nfct);
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
@@ -553,15 +602,14 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->ip_summed = old->ip_summed;
skb_copy_queue_mapping(new, old);
new->priority = old->priority;
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+#if IS_ENABLED(CONFIG_IP_VS)
new->ipvs_property = old->ipvs_property;
#endif
new->protocol = old->protocol;
new->mark = old->mark;
new->skb_iif = old->skb_iif;
__nf_copy(new, old);
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
new->nf_trace = old->nf_trace;
#endif
#ifdef CONFIG_NET_SCHED
@@ -791,8 +839,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
EXPORT_SYMBOL(skb_copy);
/**
- * pskb_copy - create copy of an sk_buff with private head.
+ * __pskb_copy - create copy of an sk_buff with private head.
* @skb: buffer to copy
+ * @headroom: headroom of new skb
* @gfp_mask: allocation priority
*
* Make a copy of both an &sk_buff and part of its data, located
@@ -803,16 +852,16 @@ EXPORT_SYMBOL(skb_copy);
* The returned buffer has a reference count of 1.
*/
-struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
+struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
{
- unsigned int size = skb_end_pointer(skb) - skb->head;
+ unsigned int size = skb_headlen(skb) + headroom;
struct sk_buff *n = alloc_skb(size, gfp_mask);
if (!n)
goto out;
/* Set the data pointer */
- skb_reserve(n, skb_headroom(skb));
+ skb_reserve(n, headroom);
/* Set the tail pointer and length */
skb_put(n, skb_headlen(skb));
/* Copy the bytes */
@@ -848,7 +897,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
out:
return n;
}
-EXPORT_SYMBOL(pskb_copy);
+EXPORT_SYMBOL(__pskb_copy);
/**
* pskb_expand_head - reallocate header of &sk_buff
@@ -2230,7 +2279,7 @@ static int skb_prepare_for_shift(struct sk_buff *skb)
* @shiftlen: shift up to this many bytes
*
* Attempts to shift up to shiftlen worth of bytes, which may be less than
- * the length of the skb, from tgt to skb. Returns number bytes shifted.
+ * the length of the skb, from skb to tgt. Returns number bytes shifted.
* It's up to caller to free skb if everything was shifted.
*
* If @tgt runs out of frags, the whole operation is aborted.
@@ -2621,7 +2670,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);
* a pointer to the first in a list of new skbs for the segments.
* In case of error it returns ERR_PTR(err).
*/
-struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
+struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
{
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
@@ -3169,6 +3218,26 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
+void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
+{
+ struct sock *sk = skb->sk;
+ struct sock_exterr_skb *serr;
+ int err;
+
+ skb->wifi_acked_valid = 1;
+ skb->wifi_acked = acked;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+
+ err = sock_queue_err_skb(sk, skb);
+ if (err)
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+
/**
* skb_partial_csum_set - set up and verify partial csum values for packet
diff --git a/net/core/sock.c b/net/core/sock.c
index 4ed7b1d12f5..002939cfc06 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -111,6 +111,7 @@
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
+#include <linux/jump_label.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -125,6 +126,7 @@
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
+#include <net/netprio_cgroup.h>
#include <linux/filter.h>
@@ -134,6 +136,46 @@
#include <net/tcp.h>
#endif
+static DEFINE_MUTEX(proto_list_mutex);
+static LIST_HEAD(proto_list);
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+ struct proto *proto;
+ int ret = 0;
+
+ mutex_lock(&proto_list_mutex);
+ list_for_each_entry(proto, &proto_list, node) {
+ if (proto->init_cgroup) {
+ ret = proto->init_cgroup(cgrp, ss);
+ if (ret)
+ goto out;
+ }
+ }
+
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+out:
+ list_for_each_entry_continue_reverse(proto, &proto_list, node)
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(cgrp, ss);
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+}
+
+void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+ struct proto *proto;
+
+ mutex_lock(&proto_list_mutex);
+ list_for_each_entry_reverse(proto, &proto_list, node)
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(cgrp, ss);
+ mutex_unlock(&proto_list_mutex);
+}
+#endif
+
/*
* Each address family might have different locking rules, so we have
* one slock key per address family:
@@ -141,6 +183,9 @@
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
+struct jump_label_key memcg_socket_limit_enabled;
+EXPORT_SYMBOL(memcg_socket_limit_enabled);
+
/*
* Make lock validator output more readable. (we pre-construct these
* strings build-time, so that runtime initialization of socket
@@ -221,10 +266,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
-#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
+#if defined(CONFIG_CGROUPS)
+#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
+#if !defined(CONFIG_NETPRIO_CGROUP)
+int net_prio_subsys_id = -1;
+EXPORT_SYMBOL_GPL(net_prio_subsys_id);
+#endif
+#endif
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
@@ -269,14 +320,14 @@ static void sock_warn_obsolete_bsdism(const char *name)
}
}
-static void sock_disable_timestamp(struct sock *sk, int flag)
+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+
+static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
- if (sock_flag(sk, flag)) {
- sock_reset_flag(sk, flag);
- if (!sock_flag(sk, SOCK_TIMESTAMP) &&
- !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
+ if (sk->sk_flags & flags) {
+ sk->sk_flags &= ~flags;
+ if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
net_disable_timestamp();
- }
}
}
@@ -288,11 +339,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
- /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
- number of warnings when compiling with -W --ANK
- */
- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned)sk->sk_rcvbuf) {
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
atomic_inc(&sk->sk_drops);
trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
@@ -682,7 +729,7 @@ set_rcvbuf:
SOCK_TIMESTAMPING_RX_SOFTWARE);
else
sock_disable_timestamp(sk,
- SOCK_TIMESTAMPING_RX_SOFTWARE);
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
val & SOF_TIMESTAMPING_SOFTWARE);
sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
@@ -740,6 +787,11 @@ set_rcvbuf:
case SO_RXQ_OVFL:
sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
break;
+
+ case SO_WIFI_STATUS:
+ sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -961,6 +1013,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
break;
+ case SO_WIFI_STATUS:
+ v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
+ break;
+
default:
return -ENOPROTOOPT;
}
@@ -1111,6 +1167,18 @@ void sock_update_classid(struct sock *sk)
sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);
+
+void sock_update_netprioidx(struct sock *sk)
+{
+ struct cgroup_netprio_state *state;
+ if (in_interrupt())
+ return;
+ rcu_read_lock();
+ state = task_netprio_state(current);
+ sk->sk_cgrp_prioidx = state ? state->prioidx : 0;
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif
/**
@@ -1138,6 +1206,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk);
+ sock_update_netprioidx(sk);
}
return sk;
@@ -1158,8 +1227,7 @@ static void __sk_free(struct sock *sk)
RCU_INIT_POINTER(sk->sk_filter, NULL);
}
- sock_disable_timestamp(sk, SOCK_TIMESTAMP);
- sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
+ sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
if (atomic_read(&sk->sk_omem_alloc))
printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
@@ -1204,7 +1272,14 @@ void sk_release_kernel(struct sock *sk)
}
EXPORT_SYMBOL(sk_release_kernel);
-struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+/**
+ * sk_clone_lock - clone a socket, and lock its clone
+ * @sk: the socket to clone
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
struct sock *newsk;
@@ -1288,16 +1363,15 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
newsk->sk_wq = NULL;
if (newsk->sk_prot->sockets_allocated)
- percpu_counter_inc(newsk->sk_prot->sockets_allocated);
+ sk_sockets_allocated_inc(newsk);
- if (sock_flag(newsk, SOCK_TIMESTAMP) ||
- sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+ if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
net_enable_timestamp();
}
out:
return newsk;
}
-EXPORT_SYMBOL_GPL(sk_clone);
+EXPORT_SYMBOL_GPL(sk_clone_lock);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
@@ -1677,30 +1751,34 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
struct proto *prot = sk->sk_prot;
int amt = sk_mem_pages(size);
long allocated;
+ int parent_status = UNDER_LIMIT;
sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
- allocated = atomic_long_add_return(amt, prot->memory_allocated);
+
+ allocated = sk_memory_allocated_add(sk, amt, &parent_status);
/* Under limit. */
- if (allocated <= prot->sysctl_mem[0]) {
- if (prot->memory_pressure && *prot->memory_pressure)
- *prot->memory_pressure = 0;
+ if (parent_status == UNDER_LIMIT &&
+ allocated <= sk_prot_mem_limits(sk, 0)) {
+ sk_leave_memory_pressure(sk);
return 1;
}
- /* Under pressure. */
- if (allocated > prot->sysctl_mem[1])
- if (prot->enter_memory_pressure)
- prot->enter_memory_pressure(sk);
+ /* Under pressure. (we or our parents) */
+ if ((parent_status > SOFT_LIMIT) ||
+ allocated > sk_prot_mem_limits(sk, 1))
+ sk_enter_memory_pressure(sk);
- /* Over hard limit. */
- if (allocated > prot->sysctl_mem[2])
+ /* Over hard limit (we or our parents) */
+ if ((parent_status == OVER_LIMIT) ||
+ (allocated > sk_prot_mem_limits(sk, 2)))
goto suppress_allocation;
/* guarantee minimum buffer size under pressure */
if (kind == SK_MEM_RECV) {
if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
return 1;
+
} else { /* SK_MEM_SEND */
if (sk->sk_type == SOCK_STREAM) {
if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
@@ -1710,13 +1788,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
return 1;
}
- if (prot->memory_pressure) {
+ if (sk_has_memory_pressure(sk)) {
int alloc;
- if (!*prot->memory_pressure)
+ if (!sk_under_memory_pressure(sk))
return 1;
- alloc = percpu_counter_read_positive(prot->sockets_allocated);
- if (prot->sysctl_mem[2] > alloc *
+ alloc = sk_sockets_allocated_read_positive(sk);
+ if (sk_prot_mem_limits(sk, 2) > alloc *
sk_mem_pages(sk->sk_wmem_queued +
atomic_read(&sk->sk_rmem_alloc) +
sk->sk_forward_alloc))
@@ -1739,7 +1817,9 @@ suppress_allocation:
/* Alas. Undo changes. */
sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
- atomic_long_sub(amt, prot->memory_allocated);
+
+ sk_memory_allocated_sub(sk, amt, parent_status);
+
return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1750,15 +1830,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
*/
void __sk_mem_reclaim(struct sock *sk)
{
- struct proto *prot = sk->sk_prot;
-
- atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
- prot->memory_allocated);
+ sk_memory_allocated_sub(sk,
+ sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0);
sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
- if (prot->memory_pressure && *prot->memory_pressure &&
- (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
- *prot->memory_pressure = 0;
+ if (sk_under_memory_pressure(sk) &&
+ (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
+ sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -2129,16 +2207,15 @@ EXPORT_SYMBOL(sock_get_timestampns);
void sock_enable_timestamp(struct sock *sk, int flag)
{
if (!sock_flag(sk, flag)) {
+ unsigned long previous_flags = sk->sk_flags;
+
sock_set_flag(sk, flag);
/*
* we just set one of the two flags which require net
* time stamping, but time stamping might have been on
* already because of the other one
*/
- if (!sock_flag(sk,
- flag == SOCK_TIMESTAMP ?
- SOCK_TIMESTAMPING_RX_SOFTWARE :
- SOCK_TIMESTAMP))
+ if (!(previous_flags & SK_FLAGS_TIMESTAMP))
net_enable_timestamp();
}
}
@@ -2250,9 +2327,6 @@ void sk_common_release(struct sock *sk)
}
EXPORT_SYMBOL(sk_common_release);
-static DEFINE_RWLOCK(proto_list_lock);
-static LIST_HEAD(proto_list);
-
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR 64 /* should be enough for the first time */
struct prot_inuse {
@@ -2401,10 +2475,10 @@ int proto_register(struct proto *prot, int alloc_slab)
}
}
- write_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
list_add(&prot->node, &proto_list);
assign_proto_idx(prot);
- write_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
return 0;
out_free_timewait_sock_slab_name:
@@ -2427,10 +2501,10 @@ EXPORT_SYMBOL(proto_register);
void proto_unregister(struct proto *prot)
{
- write_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
release_proto_idx(prot);
list_del(&prot->node);
- write_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
if (prot->slab != NULL) {
kmem_cache_destroy(prot->slab);
@@ -2453,9 +2527,9 @@ EXPORT_SYMBOL(proto_unregister);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(proto_list_lock)
+ __acquires(proto_list_mutex)
{
- read_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
return seq_list_start_head(&proto_list, *pos);
}
@@ -2465,25 +2539,36 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void proto_seq_stop(struct seq_file *seq, void *v)
- __releases(proto_list_lock)
+ __releases(proto_list_mutex)
{
- read_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
}
static char proto_method_implemented(const void *method)
{
return method == NULL ? 'n' : 'y';
}
+static long sock_prot_memory_allocated(struct proto *proto)
+{
+ return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L;
+}
+
+static char *sock_prot_memory_pressure(struct proto *proto)
+{
+ return proto->memory_pressure != NULL ?
+ proto_memory_pressure(proto) ? "yes" : "no" : "NI";
+}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
+
seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name,
proto->obj_size,
sock_prot_inuse_get(seq_file_net(seq), proto),
- proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
- proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+ sock_prot_memory_allocated(proto),
+ sock_prot_memory_pressure(proto),
proto->max_header,
proto->slab == NULL ? "no" : "yes",
module_name(proto->owner),
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
new file mode 100644
index 00000000000..b9868e1fd62
--- /dev/null
+++ b/net/core/sock_diag.c
@@ -0,0 +1,192 @@
+#include <linux/mutex.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <linux/module.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+static struct sock_diag_handler *sock_diag_handlers[AF_MAX];
+static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
+static DEFINE_MUTEX(sock_diag_table_mutex);
+
+int sock_diag_check_cookie(void *sk, __u32 *cookie)
+{
+ if ((cookie[0] != INET_DIAG_NOCOOKIE ||
+ cookie[1] != INET_DIAG_NOCOOKIE) &&
+ ((u32)(unsigned long)sk != cookie[0] ||
+ (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1]))
+ return -ESTALE;
+ else
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
+
+void sock_diag_save_cookie(void *sk, __u32 *cookie)
+{
+ cookie[0] = (u32)(unsigned long)sk;
+ cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+}
+EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
+
+int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
+{
+ __u32 *mem;
+
+ mem = RTA_DATA(__RTA_PUT(skb, attrtype, SK_MEMINFO_VARS * sizeof(__u32)));
+
+ mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
+ mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
+ mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
+ mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
+
+ return 0;
+
+rtattr_failure:
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
+
+void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = fn;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
+
+void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
+
+int sock_diag_register(struct sock_diag_handler *hndl)
+{
+ int err = 0;
+
+ if (hndl->family >= AF_MAX)
+ return -EINVAL;
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (sock_diag_handlers[hndl->family])
+ err = -EBUSY;
+ else
+ sock_diag_handlers[hndl->family] = hndl;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sock_diag_register);
+
+void sock_diag_unregister(struct sock_diag_handler *hnld)
+{
+ int family = hnld->family;
+
+ if (family >= AF_MAX)
+ return;
+
+ mutex_lock(&sock_diag_table_mutex);
+ BUG_ON(sock_diag_handlers[family] != hnld);
+ sock_diag_handlers[family] = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister);
+
+static inline struct sock_diag_handler *sock_diag_lock_handler(int family)
+{
+ if (sock_diag_handlers[family] == NULL)
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, family);
+
+ mutex_lock(&sock_diag_table_mutex);
+ return sock_diag_handlers[family];
+}
+
+static inline void sock_diag_unlock_handler(struct sock_diag_handler *h)
+{
+ mutex_unlock(&sock_diag_table_mutex);
+}
+
+static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int err;
+ struct sock_diag_req *req = NLMSG_DATA(nlh);
+ struct sock_diag_handler *hndl;
+
+ if (nlmsg_len(nlh) < sizeof(*req))
+ return -EINVAL;
+
+ hndl = sock_diag_lock_handler(req->sdiag_family);
+ if (hndl == NULL)
+ err = -ENOENT;
+ else
+ err = hndl->dump(skb, nlh);
+ sock_diag_unlock_handler(hndl);
+
+ return err;
+}
+
+static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int ret;
+
+ switch (nlh->nlmsg_type) {
+ case TCPDIAG_GETSOCK:
+ case DCCPDIAG_GETSOCK:
+ if (inet_rcv_compat == NULL)
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET);
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (inet_rcv_compat != NULL)
+ ret = inet_rcv_compat(skb, nlh);
+ else
+ ret = -EOPNOTSUPP;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return ret;
+ case SOCK_DIAG_BY_FAMILY:
+ return __sock_diag_rcv_msg(skb, nlh);
+ default:
+ return -EINVAL;
+ }
+}
+
+static DEFINE_MUTEX(sock_diag_mutex);
+
+static void sock_diag_rcv(struct sk_buff *skb)
+{
+ mutex_lock(&sock_diag_mutex);
+ netlink_rcv_skb(skb, &sock_diag_rcv_msg);
+ mutex_unlock(&sock_diag_mutex);
+}
+
+struct sock *sock_diag_nlsk;
+EXPORT_SYMBOL_GPL(sock_diag_nlsk);
+
+static int __init sock_diag_init(void)
+{
+ sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0,
+ sock_diag_rcv, NULL, THIS_MODULE);
+ return sock_diag_nlsk == NULL ? -ENOMEM : 0;
+}
+
+static void __exit sock_diag_exit(void)
+{
+ netlink_kernel_release(sock_diag_nlsk);
+}
+
+module_init(sock_diag_init);
+module_exit(sock_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 77a65f03148..d05559d4d9c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -68,8 +68,13 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
if (sock_table != orig_sock_table) {
rcu_assign_pointer(rps_sock_flow_table, sock_table);
- synchronize_rcu();
- vfree(orig_sock_table);
+ if (sock_table)
+ jump_label_inc(&rps_needed);
+ if (orig_sock_table) {
+ jump_label_dec(&rps_needed);
+ synchronize_rcu();
+ vfree(orig_sock_table);
+ }
}
}