summary refs log tree commit diff stats
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r-- net/core/dev.c 138
-rw-r--r-- net/core/dev_addr_lists.c 40
-rw-r--r-- net/core/dst.c 6
-rw-r--r-- net/core/ethtool.c 12
-rw-r--r-- net/core/fib_rules.c 6
-rw-r--r-- net/core/filter.c 27
-rw-r--r-- net/core/link_watch.c 29
-rw-r--r-- net/core/neighbour.c 10
-rw-r--r-- net/core/net-sysfs.c 18
-rw-r--r-- net/core/netpoll.c 5
-rw-r--r-- net/core/netprio_cgroup.c 102
-rw-r--r-- net/core/request_sock.c 95
-rw-r--r-- net/core/rtnetlink.c 38
-rw-r--r-- net/core/scm.c 47
-rw-r--r-- net/core/secure_seq.c 1
-rw-r--r-- net/core/skbuff.c 86
-rw-r--r-- net/core/sock.c 89
-rw-r--r-- net/core/sock_diag.c 3
-rw-r--r-- net/core/utils.c 20
19 files changed, 475 insertions, 297 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 89e33a5d4d9..1e0a1847c3b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -959,18 +959,30 @@ int dev_alloc_name(struct net_device *dev, const char *name)
}
EXPORT_SYMBOL(dev_alloc_name);
-static int dev_get_valid_name(struct net_device *dev, const char *name)
+static int dev_alloc_name_ns(struct net *net,
+ struct net_device *dev,
+ const char *name)
{
- struct net *net;
+ char buf[IFNAMSIZ];
+ int ret;
- BUG_ON(!dev_net(dev));
- net = dev_net(dev);
+ ret = __dev_alloc_name(net, name, buf);
+ if (ret >= 0)
+ strlcpy(dev->name, buf, IFNAMSIZ);
+ return ret;
+}
+
+static int dev_get_valid_name(struct net *net,
+ struct net_device *dev,
+ const char *name)
+{
+ BUG_ON(!net);
if (!dev_valid_name(name))
return -EINVAL;
if (strchr(name, '%'))
- return dev_alloc_name(dev, name);
+ return dev_alloc_name_ns(net, dev, name);
else if (__dev_get_by_name(net, name))
return -EEXIST;
else if (dev->name != name)
@@ -1006,7 +1018,7 @@ int dev_change_name(struct net_device *dev, const char *newname)
memcpy(oldname, dev->name, IFNAMSIZ);
- err = dev_get_valid_name(dev, newname);
+ err = dev_get_valid_name(net, dev, newname);
if (err < 0)
return err;
@@ -1109,11 +1121,23 @@ void netdev_state_change(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_state_change);
-int netdev_bonding_change(struct net_device *dev, unsigned long event)
+/**
+ * netdev_notify_peers - notify network peers about existence of @dev
+ * @dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void netdev_notify_peers(struct net_device *dev)
{
- return call_netdevice_notifiers(event, dev);
+ rtnl_lock();
+ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+ rtnl_unlock();
}
-EXPORT_SYMBOL(netdev_bonding_change);
+EXPORT_SYMBOL(netdev_notify_peers);
/**
* dev_load - load a network module
@@ -1394,7 +1418,6 @@ rollback:
nb->notifier_call(nb, NETDEV_DOWN, dev);
}
nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
- nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
}
}
@@ -1436,7 +1459,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
nb->notifier_call(nb, NETDEV_DOWN, dev);
}
nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
- nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
}
}
unlock:
@@ -2175,9 +2197,7 @@ EXPORT_SYMBOL(netif_skb_features);
/*
* Returns true if either:
* 1. skb has frag_list and the device doesn't support FRAGLIST, or
- * 2. skb is fragmented and the device does not support SG, or if
- * at least one of fragments is in highmem and device does not
- * support DMA from it.
+ * 2. skb is fragmented and the device does not support SG.
*/
static inline int skb_needs_linearize(struct sk_buff *skb,
int features)
@@ -2206,9 +2226,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
- if (!list_empty(&ptype_all))
- dev_queue_xmit_nit(skb, dev);
-
features = netif_skb_features(skb);
if (vlan_tx_tag_present(skb) &&
@@ -2243,6 +2260,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
}
}
+ if (!list_empty(&ptype_all))
+ dev_queue_xmit_nit(skb, dev);
+
skb_len = skb->len;
rc = ops->ndo_start_xmit(skb, dev);
trace_net_dev_xmit(skb, rc, dev, skb_len);
@@ -2265,6 +2285,9 @@ gso:
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(nskb);
+ if (!list_empty(&ptype_all))
+ dev_queue_xmit_nit(nskb, dev);
+
skb_len = nskb->len;
rc = ops->ndo_start_xmit(nskb, dev);
trace_net_dev_xmit(nskb, rc, dev, skb_len);
@@ -2374,8 +2397,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
#endif
}
-static struct netdev_queue *dev_pick_tx(struct net_device *dev,
- struct sk_buff *skb)
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+ struct sk_buff *skb)
{
int queue_index;
const struct net_device_ops *ops = dev->netdev_ops;
@@ -2549,7 +2572,7 @@ int dev_queue_xmit(struct sk_buff *skb)
skb_update_prio(skb);
- txq = dev_pick_tx(dev, skb);
+ txq = netdev_pick_tx(dev, skb);
q = rcu_dereference_bh(txq->qdisc);
#ifdef CONFIG_NET_CLS_ACT
@@ -2622,6 +2645,8 @@ EXPORT_SYMBOL(dev_queue_xmit);
=======================================================================*/
int netdev_max_backlog __read_mostly = 1000;
+EXPORT_SYMBOL(netdev_max_backlog);
+
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64; /* old backlog weight */
@@ -4512,8 +4537,8 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
static int __dev_set_promiscuity(struct net_device *dev, int inc)
{
unsigned int old_flags = dev->flags;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
ASSERT_RTNL();
@@ -4544,8 +4569,9 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)
"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
dev->name, (dev->flags & IFF_PROMISC),
(old_flags & IFF_PROMISC),
- audit_get_loginuid(current),
- uid, gid,
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
audit_get_sessionid(current));
}
@@ -5238,12 +5264,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
*/
static int dev_new_index(struct net *net)
{
- static int ifindex;
+ int ifindex = net->ifindex;
for (;;) {
if (++ifindex <= 0)
ifindex = 1;
if (!__dev_get_by_index(net, ifindex))
- return ifindex;
+ return net->ifindex = ifindex;
}
}
@@ -5321,10 +5347,6 @@ static void rollback_registered_many(struct list_head *head)
netdev_unregister_kobject(dev);
}
- /* Process any work delayed until the end of the batch */
- dev = list_first_entry(head, struct net_device, unreg_list);
- call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
-
synchronize_net();
list_for_each_entry(dev, head, unreg_list)
@@ -5582,7 +5604,7 @@ int register_netdevice(struct net_device *dev)
dev->iflink = -1;
- ret = dev_get_valid_name(dev, dev->name);
+ ret = dev_get_valid_name(net, dev, dev->name);
if (ret < 0)
goto out;
@@ -5596,7 +5618,12 @@ int register_netdevice(struct net_device *dev)
}
}
- dev->ifindex = dev_new_index(net);
+ ret = -EBUSY;
+ if (!dev->ifindex)
+ dev->ifindex = dev_new_index(net);
+ else if (__dev_get_by_index(net, dev->ifindex))
+ goto err_uninit;
+
if (dev->iflink == -1)
dev->iflink = dev->ifindex;
@@ -5639,6 +5666,8 @@ int register_netdevice(struct net_device *dev)
set_bit(__LINK_STATE_PRESENT, &dev->state);
+ linkwatch_init_dev(dev);
+
dev_init_scheduler(dev);
dev_hold(dev);
list_netdevice(dev);
@@ -5772,9 +5801,12 @@ static void netdev_wait_allrefs(struct net_device *dev)
/* Rebroadcast unregister notification */
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
- * should have already handle it the first time */
+ __rtnl_unlock();
+ rcu_barrier();
+ rtnl_lock();
+
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
&dev->state)) {
/* We must not have linkwatch events
@@ -5836,9 +5868,8 @@ void netdev_run_todo(void)
__rtnl_unlock();
- /* Wait for rcu callbacks to finish before attempting to drain
- * the device list. This usually avoids a 250ms wait.
- */
+
+ /* Wait for rcu callbacks to finish before next phase */
if (!list_empty(&list))
rcu_barrier();
@@ -5847,6 +5878,10 @@ void netdev_run_todo(void)
= list_first_entry(&list, struct net_device, todo_list);
list_del(&dev->todo_list);
+ rtnl_lock();
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+ __rtnl_unlock();
+
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
pr_err("network todo '%s' but state %d\n",
dev->name, dev->reg_state);
@@ -5942,6 +5977,8 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
return queue;
}
+static const struct ethtool_ops default_ethtool_ops;
+
/**
* alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
@@ -6029,6 +6066,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
strcpy(dev->name, name);
dev->group = INIT_NETDEV_GROUP;
+ if (!dev->ethtool_ops)
+ dev->ethtool_ops = &default_ethtool_ops;
return dev;
free_all:
@@ -6213,7 +6252,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
/* We get here if we can't use the current device name */
if (!pat)
goto out;
- if (dev_get_valid_name(dev, pat) < 0)
+ if (dev_get_valid_name(net, dev, pat) < 0)
goto out;
}
@@ -6241,7 +6280,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
the device is just moving and can keep their slaves up.
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
+ rcu_barrier();
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
/*
@@ -6424,22 +6464,26 @@ const char *netdev_drivername(const struct net_device *dev)
return empty;
}
-int __netdev_printk(const char *level, const struct net_device *dev,
+static int __netdev_printk(const char *level, const struct net_device *dev,
struct va_format *vaf)
{
int r;
- if (dev && dev->dev.parent)
- r = dev_printk(level, dev->dev.parent, "%s: %pV",
- netdev_name(dev), vaf);
- else if (dev)
+ if (dev && dev->dev.parent) {
+ r = dev_printk_emit(level[1] - '0',
+ dev->dev.parent,
+ "%s %s %s: %pV",
+ dev_driver_string(dev->dev.parent),
+ dev_name(dev->dev.parent),
+ netdev_name(dev), vaf);
+ } else if (dev) {
r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
- else
+ } else {
r = printk("%s(NULL net_device): %pV", level, vaf);
+ }
return r;
}
-EXPORT_SYMBOL(__netdev_printk);
int netdev_printk(const char *level, const struct net_device *dev,
const char *format, ...)
@@ -6454,6 +6498,7 @@ int netdev_printk(const char *level, const struct net_device *dev,
vaf.va = &args;
r = __netdev_printk(level, dev, &vaf);
+
va_end(args);
return r;
@@ -6473,6 +6518,7 @@ int func(const struct net_device *dev, const char *fmt, ...) \
vaf.va = &args; \
\
r = __netdev_printk(level, dev, &vaf); \
+ \
va_end(args); \
\
return r; \
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index c4cc2bc49f0..87cc17db2d5 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -22,7 +22,7 @@
*/
static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
- unsigned char *addr, int addr_len,
+ const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global)
{
struct netdev_hw_addr *ha;
@@ -46,7 +46,7 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
}
static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
- unsigned char *addr, int addr_len,
+ const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global)
{
struct netdev_hw_addr *ha;
@@ -72,14 +72,15 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
return __hw_addr_create_ex(list, addr, addr_len, addr_type, global);
}
-static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
- int addr_len, unsigned char addr_type)
+static int __hw_addr_add(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
{
return __hw_addr_add_ex(list, addr, addr_len, addr_type, false);
}
static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
- unsigned char *addr, int addr_len,
+ const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global)
{
struct netdev_hw_addr *ha;
@@ -104,8 +105,9 @@ static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
return -ENOENT;
}
-static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
- int addr_len, unsigned char addr_type)
+static int __hw_addr_del(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
{
return __hw_addr_del_ex(list, addr, addr_len, addr_type, false);
}
@@ -278,7 +280,7 @@ EXPORT_SYMBOL(dev_addr_init);
*
* The caller must hold the rtnl_mutex.
*/
-int dev_addr_add(struct net_device *dev, unsigned char *addr,
+int dev_addr_add(struct net_device *dev, const unsigned char *addr,
unsigned char addr_type)
{
int err;
@@ -303,7 +305,7 @@ EXPORT_SYMBOL(dev_addr_add);
*
* The caller must hold the rtnl_mutex.
*/
-int dev_addr_del(struct net_device *dev, unsigned char *addr,
+int dev_addr_del(struct net_device *dev, const unsigned char *addr,
unsigned char addr_type)
{
int err;
@@ -390,7 +392,7 @@ EXPORT_SYMBOL(dev_addr_del_multiple);
* @dev: device
* @addr: address to add
*/
-int dev_uc_add_excl(struct net_device *dev, unsigned char *addr)
+int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
{
struct netdev_hw_addr *ha;
int err;
@@ -421,7 +423,7 @@ EXPORT_SYMBOL(dev_uc_add_excl);
* Add a secondary unicast address to the device or increase
* the reference count if it already exists.
*/
-int dev_uc_add(struct net_device *dev, unsigned char *addr)
+int dev_uc_add(struct net_device *dev, const unsigned char *addr)
{
int err;
@@ -443,7 +445,7 @@ EXPORT_SYMBOL(dev_uc_add);
* Release reference to a secondary unicast address and remove it
* from the device if the reference count drops to zero.
*/
-int dev_uc_del(struct net_device *dev, unsigned char *addr)
+int dev_uc_del(struct net_device *dev, const unsigned char *addr)
{
int err;
@@ -543,7 +545,7 @@ EXPORT_SYMBOL(dev_uc_init);
* @dev: device
* @addr: address to add
*/
-int dev_mc_add_excl(struct net_device *dev, unsigned char *addr)
+int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
{
struct netdev_hw_addr *ha;
int err;
@@ -566,7 +568,7 @@ out:
}
EXPORT_SYMBOL(dev_mc_add_excl);
-static int __dev_mc_add(struct net_device *dev, unsigned char *addr,
+static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
bool global)
{
int err;
@@ -587,7 +589,7 @@ static int __dev_mc_add(struct net_device *dev, unsigned char *addr,
* Add a multicast address to the device or increase
* the reference count if it already exists.
*/
-int dev_mc_add(struct net_device *dev, unsigned char *addr)
+int dev_mc_add(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_add(dev, addr, false);
}
@@ -600,13 +602,13 @@ EXPORT_SYMBOL(dev_mc_add);
*
* Add a global multicast address to the device.
*/
-int dev_mc_add_global(struct net_device *dev, unsigned char *addr)
+int dev_mc_add_global(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_add(dev, addr, true);
}
EXPORT_SYMBOL(dev_mc_add_global);
-static int __dev_mc_del(struct net_device *dev, unsigned char *addr,
+static int __dev_mc_del(struct net_device *dev, const unsigned char *addr,
bool global)
{
int err;
@@ -628,7 +630,7 @@ static int __dev_mc_del(struct net_device *dev, unsigned char *addr,
* Release reference to a multicast address and remove it
* from the device if the reference count drops to zero.
*/
-int dev_mc_del(struct net_device *dev, unsigned char *addr)
+int dev_mc_del(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_del(dev, addr, false);
}
@@ -642,7 +644,7 @@ EXPORT_SYMBOL(dev_mc_del);
* Release reference to a multicast address and remove it
* from the device if the reference count drops to zero.
*/
-int dev_mc_del_global(struct net_device *dev, unsigned char *addr)
+int dev_mc_del_global(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_del(dev, addr, true);
}
diff --git a/net/core/dst.c b/net/core/dst.c
index 56d63612e1e..ee6153e2cf4 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -222,8 +222,8 @@ void __dst_free(struct dst_entry *dst)
if (dst_garbage.timer_inc > DST_GC_INC) {
dst_garbage.timer_inc = DST_GC_INC;
dst_garbage.timer_expires = DST_GC_MIN;
- cancel_delayed_work(&dst_gc_work);
- schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires);
+ mod_delayed_work(system_wq, &dst_gc_work,
+ dst_garbage.timer_expires);
}
spin_unlock_bh(&dst_garbage.lock);
}
@@ -374,7 +374,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event,
struct dst_entry *dst, *last = NULL;
switch (event) {
- case NETDEV_UNREGISTER:
+ case NETDEV_UNREGISTER_FINAL:
case NETDEV_DOWN:
mutex_lock(&dst_gc_mutex);
for (dst = dst_busy_list; dst; dst = dst->next) {
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index cbf033dcaf1..4d64cc2e3fa 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1426,18 +1426,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
return -EFAULT;
- if (!dev->ethtool_ops) {
- /* A few commands do not require any driver support,
- * are unprivileged, and do not change anything, so we
- * can take a shortcut to them. */
- if (ethcmd == ETHTOOL_GDRVINFO)
- return ethtool_get_drvinfo(dev, useraddr);
- else if (ethcmd == ETHTOOL_GET_TS_INFO)
- return ethtool_get_ts_info(dev, useraddr);
- else
- return -EOPNOTSUPP;
- }
-
/* Allow some commands to be done by anyone */
switch (ethcmd) {
case ETHTOOL_GSET:
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index ab7db83236c..58a4ba27dfe 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -402,7 +402,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (unresolved)
ops->unresolved_rules++;
- notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
+ notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
flush_route_cache(ops);
rules_ops_put(ops);
return 0;
@@ -500,7 +500,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
}
notify_rule_change(RTM_DELRULE, rule, ops, nlh,
- NETLINK_CB(skb).pid);
+ NETLINK_CB(skb).portid);
if (ops->delete)
ops->delete(rule);
fib_rule_put(rule);
@@ -601,7 +601,7 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
if (idx < cb->args[1])
goto skip;
- if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid,
+ if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, RTM_NEWRULE,
NLM_F_MULTI, ops) < 0)
break;
diff --git a/net/core/filter.c b/net/core/filter.c
index 907efd27ec7..3d92ebb7fbc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -167,6 +167,14 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
case BPF_S_ALU_DIV_K:
A = reciprocal_divide(A, K);
continue;
+ case BPF_S_ALU_MOD_X:
+ if (X == 0)
+ return 0;
+ A %= X;
+ continue;
+ case BPF_S_ALU_MOD_K:
+ A %= K;
+ continue;
case BPF_S_ALU_AND_X:
A &= X;
continue;
@@ -179,6 +187,13 @@ unsigned int sk_run_filter(const struct sk_buff *skb,
case BPF_S_ALU_OR_K:
A |= K;
continue;
+ case BPF_S_ANC_ALU_XOR_X:
+ case BPF_S_ALU_XOR_X:
+ A ^= X;
+ continue;
+ case BPF_S_ALU_XOR_K:
+ A ^= K;
+ continue;
case BPF_S_ALU_LSH_X:
A <<= X;
continue;
@@ -326,9 +341,6 @@ load_b:
case BPF_S_ANC_CPU:
A = raw_smp_processor_id();
continue;
- case BPF_S_ANC_ALU_XOR_X:
- A ^= X;
- continue;
case BPF_S_ANC_NLATTR: {
struct nlattr *nla;
@@ -469,10 +481,14 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
[BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K,
[BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X,
[BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X,
+ [BPF_ALU|BPF_MOD|BPF_K] = BPF_S_ALU_MOD_K,
+ [BPF_ALU|BPF_MOD|BPF_X] = BPF_S_ALU_MOD_X,
[BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K,
[BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X,
[BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K,
[BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X,
+ [BPF_ALU|BPF_XOR|BPF_K] = BPF_S_ALU_XOR_K,
+ [BPF_ALU|BPF_XOR|BPF_X] = BPF_S_ALU_XOR_X,
[BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K,
[BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X,
[BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K,
@@ -531,6 +547,11 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
return -EINVAL;
ftest->k = reciprocal_value(ftest->k);
break;
+ case BPF_S_ALU_MOD_K:
+ /* check for division by zero */
+ if (ftest->k == 0)
+ return -EINVAL;
+ break;
case BPF_S_LD_MEM:
case BPF_S_LDX_MEM:
case BPF_S_ST:
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index c3519c6d1b1..8f82a5cc385 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -76,6 +76,14 @@ static void rfc2863_policy(struct net_device *dev)
}
+void linkwatch_init_dev(struct net_device *dev)
+{
+ /* Handle pre-registration link state changes */
+ if (!netif_carrier_ok(dev) || netif_dormant(dev))
+ rfc2863_policy(dev);
+}
+
+
static bool linkwatch_urgent_event(struct net_device *dev)
{
if (!netif_running(dev))
@@ -120,22 +128,13 @@ static void linkwatch_schedule_work(int urgent)
delay = 0;
/*
- * This is true if we've scheduled it immeditately or if we don't
- * need an immediate execution and it's already pending.
+ * If urgent, schedule immediate execution; otherwise, don't
+ * override the existing timer.
*/
- if (schedule_delayed_work(&linkwatch_work, delay) == !delay)
- return;
-
- /* Don't bother if there is nothing urgent. */
- if (!test_bit(LW_URGENT, &linkwatch_flags))
- return;
-
- /* It's already running which is good enough. */
- if (!__cancel_delayed_work(&linkwatch_work))
- return;
-
- /* Otherwise we reschedule it again for immediate execution. */
- schedule_delayed_work(&linkwatch_work, 0);
+ if (test_bit(LW_URGENT, &linkwatch_flags))
+ mod_delayed_work(system_wq, &linkwatch_work, 0);
+ else
+ schedule_delayed_work(&linkwatch_work, delay);
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 117afaf5126..baca771caae 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1545,7 +1545,7 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl)
panic("cannot allocate neighbour cache hashes");
rwlock_init(&tbl->lock);
- INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work);
+ INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time);
setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
skb_queue_head_init_class(&tbl->proxy_queue,
@@ -2102,7 +2102,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
if (tidx < tbl_skip || (family && tbl->family != family))
continue;
- if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid,
+ if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
NLM_F_MULTI) <= 0)
break;
@@ -2115,7 +2115,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
goto next;
if (neightbl_fill_param_info(skb, tbl, p,
- NETLINK_CB(cb->skb).pid,
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGHTBL,
NLM_F_MULTI) <= 0)
@@ -2244,7 +2244,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
continue;
if (idx < s_idx)
goto next;
- if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+ if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
NLM_F_MULTI) <= 0) {
@@ -2281,7 +2281,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
continue;
if (idx < s_idx)
goto next;
- if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+ if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
NLM_F_MULTI, tbl) <= 0) {
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 72607174ea5..bcf02f608cb 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -166,9 +166,21 @@ static ssize_t show_duplex(struct device *dev,
if (netif_running(netdev)) {
struct ethtool_cmd cmd;
- if (!__ethtool_get_settings(netdev, &cmd))
- ret = sprintf(buf, "%s\n",
- cmd.duplex ? "full" : "half");
+ if (!__ethtool_get_settings(netdev, &cmd)) {
+ const char *duplex;
+ switch (cmd.duplex) {
+ case DUPLEX_HALF:
+ duplex = "half";
+ break;
+ case DUPLEX_FULL:
+ duplex = "full";
+ break;
+ default:
+ duplex = "unknown";
+ break;
+ }
+ ret = sprintf(buf, "%s\n", duplex);
+ }
}
rtnl_unlock();
return ret;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e4ba3e70c17..77a0388fc3b 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -328,7 +328,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
struct netdev_queue *txq;
- txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+ txq = netdev_pick_tx(dev, skb);
/* try until next clock tick */
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
@@ -380,6 +380,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
struct udphdr *udph;
struct iphdr *iph;
struct ethhdr *eth;
+ static atomic_t ip_ident;
udp_len = len + sizeof(*udph);
ip_len = udp_len + sizeof(*iph);
@@ -415,7 +416,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
put_unaligned(0x45, (unsigned char *)iph);
iph->tos = 0;
put_unaligned(htons(ip_len), &(iph->tot_len));
- iph->id = 0;
+ iph->id = htons(atomic_inc_return(&ip_ident));
iph->frag_off = 0;
iph->ttl = 64;
iph->protocol = IPPROTO_UDP;
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index c75e3f9d060..79285a36035 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -73,7 +73,6 @@ static int extend_netdev_table(struct net_device *dev, u32 new_len)
((sizeof(u32) * new_len));
struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL);
struct netprio_map *old_priomap;
- int i;
old_priomap = rtnl_dereference(dev->priomap);
@@ -82,10 +81,10 @@ static int extend_netdev_table(struct net_device *dev, u32 new_len)
return -ENOMEM;
}
- for (i = 0;
- old_priomap && (i < old_priomap->priomap_len);
- i++)
- new_priomap->priomap[i] = old_priomap->priomap[i];
+ if (old_priomap)
+ memcpy(new_priomap->priomap, old_priomap->priomap,
+ old_priomap->priomap_len *
+ sizeof(old_priomap->priomap[0]));
new_priomap->priomap_len = new_len;
@@ -109,32 +108,6 @@ static int write_update_netdev_table(struct net_device *dev)
return ret;
}
-static int update_netdev_tables(void)
-{
- int ret = 0;
- struct net_device *dev;
- u32 max_len;
- struct netprio_map *map;
-
- rtnl_lock();
- max_len = atomic_read(&max_prioidx) + 1;
- for_each_netdev(&init_net, dev) {
- map = rtnl_dereference(dev->priomap);
- /*
- * don't allocate priomap if we didn't
- * change net_prio.ifpriomap (map == NULL),
- * this will speed up skb_update_prio.
- */
- if (map && map->priomap_len < max_len) {
- ret = extend_netdev_table(dev, max_len);
- if (ret < 0)
- break;
- }
- }
- rtnl_unlock();
- return ret;
-}
-
static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
{
struct cgroup_netprio_state *cs;
@@ -153,12 +126,6 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
goto out;
}
- ret = update_netdev_tables();
- if (ret < 0) {
- put_prioidx(cs->prioidx);
- goto out;
- }
-
return &cs->css;
out:
kfree(cs);
@@ -272,38 +239,24 @@ out_free_devname:
return ret;
}
+static int update_netprio(const void *v, struct file *file, unsigned n)
+{
+ int err;
+ struct socket *sock = sock_from_file(file, &err);
+ if (sock)
+ sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
+ return 0;
+}
+
void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
struct task_struct *p;
+ void *v;
cgroup_taskset_for_each(p, cgrp, tset) {
- unsigned int fd;
- struct fdtable *fdt;
- struct files_struct *files;
-
task_lock(p);
- files = p->files;
- if (!files) {
- task_unlock(p);
- continue;
- }
-
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- for (fd = 0; fd < fdt->max_fds; fd++) {
- struct file *file;
- struct socket *sock;
- int err;
-
- file = fcheck_files(files, fd);
- if (!file)
- continue;
-
- sock = sock_from_file(file, &err);
- if (sock)
- sock_update_netprioidx(sock->sk, p);
- }
- spin_unlock(&files->file_lock);
+ v = (void *)(unsigned long)task_netprioidx(p);
+ iterate_fd(p->files, 0, update_netprio, v);
task_unlock(p);
}
}
@@ -326,11 +279,19 @@ struct cgroup_subsys net_prio_subsys = {
.create = cgrp_create,
.destroy = cgrp_destroy,
.attach = net_prio_attach,
-#ifdef CONFIG_NETPRIO_CGROUP
.subsys_id = net_prio_subsys_id,
-#endif
.base_cftypes = ss_files,
- .module = THIS_MODULE
+ .module = THIS_MODULE,
+
+ /*
+ * net_prio has an artificial limit on the number of cgroups and
+ * disallows nesting making it impossible to co-mount it with other
+ * hierarchical subsystems. Remove the artificially low PRIOIDX_SZ
+ * limit and properly nest configuration such that children follow
+ * their parents' configurations by default and are allowed to
+ * override and remove the following.
+ */
+ .broken_hierarchy = true,
};
static int netprio_device_event(struct notifier_block *unused,
@@ -366,10 +327,6 @@ static int __init init_cgroup_netprio(void)
ret = cgroup_load_subsys(&net_prio_subsys);
if (ret)
goto out;
-#ifndef CONFIG_NETPRIO_CGROUP
- smp_wmb();
- net_prio_subsys_id = net_prio_subsys.subsys_id;
-#endif
register_netdevice_notifier(&netprio_device_notifier);
@@ -386,11 +343,6 @@ static void __exit exit_cgroup_netprio(void)
cgroup_unload_subsys(&net_prio_subsys);
-#ifndef CONFIG_NETPRIO_CGROUP
- net_prio_subsys_id = -1;
- synchronize_rcu();
-#endif
-
rtnl_lock();
for_each_netdev(&init_net, dev) {
old = rtnl_dereference(dev->priomap);
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 9b570a6a33c..c31d9e8668c 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -15,6 +15,7 @@
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/tcp.h>
#include <linux/vmalloc.h>
#include <net/request_sock.h>
@@ -130,3 +131,97 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
kfree(lopt);
}
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. Things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * Only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop(), do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first, the socket lock is difficult to use. It is
+ * not a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock. A corner
+ * case might also exist in tcp_v4_hnd_req() that will trigger this locking
+ * order.
+ *
+ * When a TFO req is created, it needs to sock_hold its listener to prevent
+ * the latter data structure from going away.
+ *
+ * This function also sets "treq->listener" to NULL and unreferences the
+ * listener socket. treq->listener is used by the listener so it is protected
+ * by the fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+ bool reset)
+{
+ struct sock *lsk = tcp_rsk(req)->listener;
+ struct fastopen_queue *fastopenq =
+ inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+ BUG_ON(!spin_is_locked(&sk->sk_lock.slock) && !sock_owned_by_user(sk));
+
+ tcp_sk(sk)->fastopen_rsk = NULL;
+ spin_lock_bh(&fastopenq->lock);
+ fastopenq->qlen--;
+ tcp_rsk(req)->listener = NULL;
+ if (req->sk) /* the child socket hasn't been accepted yet */
+ goto out;
+
+ if (!reset || lsk->sk_state != TCP_LISTEN) {
+ /* If the listener has been closed don't bother with the
+ * special RST handling below.
+ */
+ spin_unlock_bh(&fastopenq->lock);
+ sock_put(lsk);
+ reqsk_free(req);
+ return;
+ }
+ /* Wait for 60secs before removing a req that has triggered RST.
+ * This is a simple defense against TFO spoofing attack - by
+ * counting the req against fastopen.max_qlen, and disabling
+ * TFO when the qlen exceeds max_qlen.
+ *
+ * For more details see CoNext'11 "TCP Fast Open" paper.
+ */
+ req->expires = jiffies + 60*HZ;
+ if (fastopenq->rskq_rst_head == NULL)
+ fastopenq->rskq_rst_head = req;
+ else
+ fastopenq->rskq_rst_tail->dl_next = req;
+
+ req->dl_next = NULL;
+ fastopenq->rskq_rst_tail = req;
+ fastopenq->qlen++;
+out:
+ spin_unlock_bh(&fastopenq->lock);
+ sock_put(lsk);
+ return;
+}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2c5a0a06c4c..76d4c2c3c89 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -618,7 +618,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
long expires, u32 error)
{
struct rta_cacheinfo ci = {
- .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse),
+ .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),
.rta_used = dst->__use,
.rta_clntref = atomic_read(&(dst->__refcnt)),
.rta_error = error,
@@ -1081,7 +1081,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
if (idx < s_idx)
goto cont;
if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
- NETLINK_CB(cb->skb).pid,
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, 0,
NLM_F_MULTI,
ext_filter_mask) <= 0)
@@ -1812,8 +1812,6 @@ replay:
return -ENODEV;
}
- if (ifm->ifi_index)
- return -EOPNOTSUPP;
if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
return -EOPNOTSUPP;
@@ -1839,10 +1837,14 @@ replay:
return PTR_ERR(dest_net);
dev = rtnl_create_link(net, dest_net, ifname, ops, tb);
-
- if (IS_ERR(dev))
+ if (IS_ERR(dev)) {
err = PTR_ERR(dev);
- else if (ops->newlink)
+ goto out;
+ }
+
+ dev->ifindex = ifm->ifi_index;
+
+ if (ops->newlink)
err = ops->newlink(net, dev, tb, data);
else
err = register_netdevice(dev);
@@ -1897,14 +1899,14 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (nskb == NULL)
return -ENOBUFS;
- err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid,
+ err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid,
nlh->nlmsg_seq, 0, 0, ext_filter_mask);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size */
WARN_ON(err == -EMSGSIZE);
kfree_skb(nskb);
} else
- err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid);
+ err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
return err;
}
@@ -2088,7 +2090,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
(dev->priv_flags & IFF_BRIDGE_PORT)) {
master = dev->master;
- err = master->netdev_ops->ndo_fdb_add(ndm, dev, addr,
+ err = master->netdev_ops->ndo_fdb_add(ndm, tb,
+ dev, addr,
nlh->nlmsg_flags);
if (err)
goto out;
@@ -2098,7 +2101,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
/* Embedded bridge, macvlan, and any other device support */
if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_add) {
- err = dev->netdev_ops->ndo_fdb_add(ndm, dev, addr,
+ err = dev->netdev_ops->ndo_fdb_add(ndm, tb,
+ dev, addr,
nlh->nlmsg_flags);
if (!err) {
@@ -2178,9 +2182,9 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
{
struct netdev_hw_addr *ha;
int err;
- u32 pid, seq;
+ u32 portid, seq;
- pid = NETLINK_CB(cb->skb).pid;
+ portid = NETLINK_CB(cb->skb).portid;
seq = cb->nlh->nlmsg_seq;
list_for_each_entry(ha, &list->list, list) {
@@ -2188,7 +2192,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
goto skip;
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr,
- pid, seq, 0, NTF_SELF);
+ portid, seq, 0, NTF_SELF);
if (err < 0)
return err;
skip:
@@ -2356,7 +2360,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
case NETDEV_PRE_TYPE_CHANGE:
case NETDEV_GOING_DOWN:
case NETDEV_UNREGISTER:
- case NETDEV_UNREGISTER_BATCH:
+ case NETDEV_UNREGISTER_FINAL:
case NETDEV_RELEASE:
case NETDEV_JOIN:
break;
@@ -2379,9 +2383,10 @@ static int __net_init rtnetlink_net_init(struct net *net)
.groups = RTNLGRP_MAX,
.input = rtnetlink_rcv,
.cb_mutex = &rtnl_mutex,
+ .flags = NL_CFG_F_NONROOT_RECV,
};
- sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg);
+ sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
if (!sk)
return -ENOMEM;
net->rtnl = sk;
@@ -2414,7 +2419,6 @@ void __init rtnetlink_init(void)
if (register_pernet_subsys(&rtnetlink_net_ops))
panic("rtnetlink_init: cannot initialize rtnetlink\n");
- netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
register_netdevice_notifier(&rtnetlink_dev_notifier);
rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
diff --git a/net/core/scm.c b/net/core/scm.c
index 040cebeed45..ab570841a53 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -45,12 +45,17 @@
static __inline__ int scm_check_creds(struct ucred *creds)
{
const struct cred *cred = current_cred();
+ kuid_t uid = make_kuid(cred->user_ns, creds->uid);
+ kgid_t gid = make_kgid(cred->user_ns, creds->gid);
+
+ if (!uid_valid(uid) || !gid_valid(gid))
+ return -EINVAL;
if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) &&
- ((creds->uid == cred->uid || creds->uid == cred->euid ||
- creds->uid == cred->suid) || capable(CAP_SETUID)) &&
- ((creds->gid == cred->gid || creds->gid == cred->egid ||
- creds->gid == cred->sgid) || capable(CAP_SETGID))) {
+ ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) ||
+ uid_eq(uid, cred->suid)) || capable(CAP_SETUID)) &&
+ ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) ||
+ gid_eq(gid, cred->sgid)) || capable(CAP_SETGID))) {
return 0;
}
return -EPERM;
@@ -149,39 +154,54 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
goto error;
break;
case SCM_CREDENTIALS:
+ {
+ struct ucred creds;
+ kuid_t uid;
+ kgid_t gid;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
goto error;
- memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred));
- err = scm_check_creds(&p->creds);
+ memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred));
+ err = scm_check_creds(&creds);
if (err)
goto error;
- if (!p->pid || pid_vnr(p->pid) != p->creds.pid) {
+ p->creds.pid = creds.pid;
+ if (!p->pid || pid_vnr(p->pid) != creds.pid) {
struct pid *pid;
err = -ESRCH;
- pid = find_get_pid(p->creds.pid);
+ pid = find_get_pid(creds.pid);
if (!pid)
goto error;
put_pid(p->pid);
p->pid = pid;
}
+ err = -EINVAL;
+ uid = make_kuid(current_user_ns(), creds.uid);
+ gid = make_kgid(current_user_ns(), creds.gid);
+ if (!uid_valid(uid) || !gid_valid(gid))
+ goto error;
+
+ p->creds.uid = uid;
+ p->creds.gid = gid;
+
if (!p->cred ||
- (p->cred->euid != p->creds.uid) ||
- (p->cred->egid != p->creds.gid)) {
+ !uid_eq(p->cred->euid, uid) ||
+ !gid_eq(p->cred->egid, gid)) {
struct cred *cred;
err = -ENOMEM;
cred = prepare_creds();
if (!cred)
goto error;
- cred->uid = cred->euid = p->creds.uid;
- cred->gid = cred->egid = p->creds.gid;
+ cred->uid = cred->euid = uid;
+ cred->gid = cred->egid = gid;
if (p->cred)
put_cred(p->cred);
p->cred = cred;
}
break;
+ }
default:
goto error;
}
@@ -281,11 +301,10 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
break;
}
/* Bump the usage count and install the file. */
- get_file(fp[i]);
sock = sock_from_file(fp[i], &err);
if (sock)
sock_update_netprioidx(sock->sk, current);
- fd_install(new_fd, fp[i]);
+ fd_install(new_fd, get_file(fp[i]));
}
if (i > 0)
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 99b2596531b..e61a8bb7fce 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -76,6 +76,7 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
return hash[0];
}
+EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#ifdef CONFIG_INET
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e33ebae519c..cdc28598f4e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -340,43 +340,57 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
EXPORT_SYMBOL(build_skb);
struct netdev_alloc_cache {
- struct page *page;
- unsigned int offset;
- unsigned int pagecnt_bias;
+ struct page_frag frag;
+ /* we maintain a pagecount bias, so that we don't dirty the cache line
+ * containing page->_count every time we allocate a fragment.
+ */
+ unsigned int pagecnt_bias;
};
static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
-#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES)
+#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768)
+#define NETDEV_FRAG_PAGE_MAX_SIZE (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER)
+#define NETDEV_PAGECNT_MAX_BIAS NETDEV_FRAG_PAGE_MAX_SIZE
static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
struct netdev_alloc_cache *nc;
void *data = NULL;
+ int order;
unsigned long flags;
local_irq_save(flags);
nc = &__get_cpu_var(netdev_alloc_cache);
- if (unlikely(!nc->page)) {
+ if (unlikely(!nc->frag.page)) {
refill:
- nc->page = alloc_page(gfp_mask);
- if (unlikely(!nc->page))
- goto end;
+ for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
+ gfp_t gfp = gfp_mask;
+
+ if (order)
+ gfp |= __GFP_COMP | __GFP_NOWARN;
+ nc->frag.page = alloc_pages(gfp, order);
+ if (likely(nc->frag.page))
+ break;
+ if (--order < 0)
+ goto end;
+ }
+ nc->frag.size = PAGE_SIZE << order;
recycle:
- atomic_set(&nc->page->_count, NETDEV_PAGECNT_BIAS);
- nc->pagecnt_bias = NETDEV_PAGECNT_BIAS;
- nc->offset = 0;
+ atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS);
+ nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
+ nc->frag.offset = 0;
}
- if (nc->offset + fragsz > PAGE_SIZE) {
+ if (nc->frag.offset + fragsz > nc->frag.size) {
/* avoid unnecessary locked operations if possible */
- if ((atomic_read(&nc->page->_count) == nc->pagecnt_bias) ||
- atomic_sub_and_test(nc->pagecnt_bias, &nc->page->_count))
+ if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) ||
+ atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count))
goto recycle;
goto refill;
}
- data = page_address(nc->page) + nc->offset;
- nc->offset += fragsz;
+ data = page_address(nc->frag.page) + nc->frag.offset;
+ nc->frag.offset += fragsz;
nc->pagecnt_bias--;
end:
local_irq_restore(flags);
@@ -1655,38 +1669,19 @@ static struct page *linear_to_page(struct page *page, unsigned int *len,
unsigned int *offset,
struct sk_buff *skb, struct sock *sk)
{
- struct page *p = sk->sk_sndmsg_page;
- unsigned int off;
-
- if (!p) {
-new_page:
- p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0);
- if (!p)
- return NULL;
+ struct page_frag *pfrag = sk_page_frag(sk);
- off = sk->sk_sndmsg_off = 0;
- /* hold one ref to this page until it's full */
- } else {
- unsigned int mlen;
-
- /* If we are the only user of the page, we can reset offset */
- if (page_count(p) == 1)
- sk->sk_sndmsg_off = 0;
- off = sk->sk_sndmsg_off;
- mlen = PAGE_SIZE - off;
- if (mlen < 64 && mlen < *len) {
- put_page(p);
- goto new_page;
- }
+ if (!sk_page_frag_refill(sk, pfrag))
+ return NULL;
- *len = min_t(unsigned int, *len, mlen);
- }
+ *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
- memcpy(page_address(p) + off, page_address(page) + *offset, *len);
- sk->sk_sndmsg_off += *len;
- *offset = off;
+ memcpy(page_address(pfrag->page) + pfrag->offset,
+ page_address(page) + *offset, *len);
+ *offset = pfrag->offset;
+ pfrag->offset += *len;
- return p;
+ return pfrag->page;
}
static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
@@ -3488,8 +3483,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
return false;
- delta = from->truesize -
- SKB_TRUESIZE(skb_end_pointer(from) - from->head);
+ delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
}
WARN_ON_ONCE(delta < len);
diff --git a/net/core/sock.c b/net/core/sock.c
index a6000fbad29..8a146cfcc36 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -326,17 +326,6 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(__sk_backlog_rcv);
-#if defined(CONFIG_CGROUPS)
-#if !defined(CONFIG_NET_CLS_CGROUP)
-int net_cls_subsys_id = -1;
-EXPORT_SYMBOL_GPL(net_cls_subsys_id);
-#endif
-#if !defined(CONFIG_NETPRIO_CGROUP)
-int net_prio_subsys_id = -1;
-EXPORT_SYMBOL_GPL(net_prio_subsys_id);
-#endif
-#endif
-
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
struct timeval tv;
@@ -869,8 +858,8 @@ void cred_to_ucred(struct pid *pid, const struct cred *cred,
if (cred) {
struct user_namespace *current_ns = current_user_ns();
- ucred->uid = from_kuid(current_ns, cred->euid);
- ucred->gid = from_kgid(current_ns, cred->egid);
+ ucred->uid = from_kuid_munged(current_ns, cred->euid);
+ ucred->gid = from_kgid_munged(current_ns, cred->egid);
}
}
EXPORT_SYMBOL_GPL(cred_to_ucred);
@@ -1224,6 +1213,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
}
#ifdef CONFIG_CGROUPS
+#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
void sock_update_classid(struct sock *sk)
{
u32 classid;
@@ -1231,11 +1221,13 @@ void sock_update_classid(struct sock *sk)
rcu_read_lock(); /* doing current task, which cannot vanish. */
classid = task_cls_classid(current);
rcu_read_unlock();
- if (classid && classid != sk->sk_classid)
+ if (classid != sk->sk_classid)
sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);
+#endif
+#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
{
if (in_interrupt())
@@ -1245,6 +1237,7 @@ void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif
+#endif
/**
* sk_alloc - All socket objects are allocated here
@@ -1465,19 +1458,6 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
-void __init sk_init(void)
-{
- if (totalram_pages <= 4096) {
- sysctl_wmem_max = 32767;
- sysctl_rmem_max = 32767;
- sysctl_wmem_default = 32767;
- sysctl_rmem_default = 32767;
- } else if (totalram_pages >= 131072) {
- sysctl_wmem_max = 131071;
- sysctl_rmem_max = 131071;
- }
-}
-
/*
* Simple resource managers for sockets.
*/
@@ -1535,12 +1515,12 @@ void sock_edemux(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_edemux);
-int sock_i_uid(struct sock *sk)
+kuid_t sock_i_uid(struct sock *sk)
{
- int uid;
+ kuid_t uid;
read_lock_bh(&sk->sk_callback_lock);
- uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
+ uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
read_unlock_bh(&sk->sk_callback_lock);
return uid;
}
@@ -1745,6 +1725,45 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
}
EXPORT_SYMBOL(sock_alloc_send_skb);
+/* On 32bit arches, an skb frag is limited to 2^15 */
+#define SKB_FRAG_PAGE_ORDER get_order(32768)
+
+bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ int order;
+
+ if (pfrag->page) {
+ if (atomic_read(&pfrag->page->_count) == 1) {
+ pfrag->offset = 0;
+ return true;
+ }
+ if (pfrag->offset < pfrag->size)
+ return true;
+ put_page(pfrag->page);
+ }
+
+ /* We restrict high order allocations to users that can afford to wait */
+ order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
+
+ do {
+ gfp_t gfp = sk->sk_allocation;
+
+ if (order)
+ gfp |= __GFP_COMP | __GFP_NOWARN;
+ pfrag->page = alloc_pages(gfp, order);
+ if (likely(pfrag->page)) {
+ pfrag->offset = 0;
+ pfrag->size = PAGE_SIZE << order;
+ return true;
+ }
+ } while (--order >= 0);
+
+ sk_enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ return false;
+}
+EXPORT_SYMBOL(sk_page_frag_refill);
+
static void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
@@ -2174,8 +2193,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
- sk->sk_sndmsg_page = NULL;
- sk->sk_sndmsg_off = 0;
+ sk->sk_frag.page = NULL;
+ sk->sk_frag.offset = 0;
sk->sk_peek_off = -1;
sk->sk_peer_pid = NULL;
@@ -2418,6 +2437,12 @@ void sk_common_release(struct sock *sk)
xfrm_sk_free_policy(sk);
sk_refcnt_debug_release(sk);
+
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ }
+
sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 9d8755e4a7a..602cd637182 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -172,8 +172,7 @@ static int __net_init diag_net_init(struct net *net)
.input = sock_diag_rcv,
};
- net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG,
- THIS_MODULE, &cfg);
+ net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
return net->diag_nlsk == NULL ? -ENOMEM : 0;
}
diff --git a/net/core/utils.c b/net/core/utils.c
index 39895a65e54..f5613d569c2 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -294,6 +294,26 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
}
EXPORT_SYMBOL(inet_proto_csum_replace4);
+void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
+ const __be32 *from, const __be32 *to,
+ int pseudohdr)
+{
+ __be32 diff[] = {
+ ~from[0], ~from[1], ~from[2], ~from[3],
+ to[0], to[1], to[2], to[3],
+ };
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ *sum = csum_fold(csum_partial(diff, sizeof(diff),
+ ~csum_unfold(*sum)));
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_partial(diff, sizeof(diff),
+ ~skb->csum);
+ } else if (pseudohdr)
+ *sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+ csum_unfold(*sum)));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace16);
+
int mac_pton(const char *s, u8 *mac)
{
int i;