summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c6
-rw-r--r--net/8021q/vlan_dev.c3
-rw-r--r--net/Makefile2
-rw-r--r--net/atm/atm_misc.c11
-rw-r--r--net/atm/common.c66
-rw-r--r--net/atm/common.h2
-rw-r--r--net/atm/resources.c78
-rw-r--r--net/atm/resources.h3
-rw-r--r--net/bridge/br_if.c1
-rw-r--r--net/bridge/br_netfilter.c19
-rw-r--r--net/core/datagram.c21
-rw-r--r--net/core/dev.c13
-rw-r--r--net/core/filter.c10
-rw-r--r--net/core/netpoll.c18
-rw-r--r--net/core/skbuff.c2
-rw-r--r--net/dccp/ipv4.c2
-rw-r--r--net/dccp/proto.c1
-rw-r--r--net/decnet/af_decnet.c39
-rw-r--r--net/decnet/sysctl_net_decnet.c33
-rw-r--r--net/ieee80211/Kconfig2
-rw-r--r--net/ipv4/af_inet.c7
-rw-r--r--net/ipv4/devinet.c40
-rw-r--r--net/ipv4/fib_frontend.c10
-rw-r--r--net/ipv4/fib_hash.c2
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/fib_trie.c3
-rw-r--r--net/ipv4/icmp.c10
-rw-r--r--net/ipv4/igmp.c24
-rw-r--r--net/ipv4/ip_fragment.c40
-rw-r--r--net/ipv4/ip_gre.c19
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c2
-rw-r--r--net/ipv4/netfilter/Kconfig18
-rw-r--r--net/ipv4/netfilter/Makefile3
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c22
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c6
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c6
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netlink.c103
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c15
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_sctp.c4
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c46
-rw-r--r--net/ipv4/netfilter/ip_conntrack_tftp.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c2
-rw-r--r--net/ipv4/netfilter/ip_nat_tftp.c5
-rw-r--r--net/ipv4/netfilter/ip_tables.c2
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c4
-rw-r--r--net/ipv4/netfilter/ipt_recent.c1
-rw-r--r--net/ipv4/proc.c10
-rw-r--r--net/ipv4/route.c5
-rw-r--r--net/ipv4/sysctl_net_ipv4.c8
-rw-r--r--net/ipv4/tcp.c11
-rw-r--r--net/ipv4/tcp_bic.c12
-rw-r--r--net/ipv4/tcp_cong.c40
-rw-r--r--net/ipv4/tcp_highspeed.c11
-rw-r--r--net/ipv4/tcp_htcp.c13
-rw-r--r--net/ipv4/tcp_hybla.c6
-rw-r--r--net/ipv4/tcp_input.c288
-rw-r--r--net/ipv4/tcp_ipv4.c28
-rw-r--r--net/ipv4/tcp_minisocks.c7
-rw-r--r--net/ipv4/tcp_output.c294
-rw-r--r--net/ipv4/tcp_scalable.c14
-rw-r--r--net/ipv4/tcp_timer.c4
-rw-r--r--net/ipv4/tcp_vegas.c44
-rw-r--r--net/ipv4/udp.c7
-rw-r--r--net/ipv4/xfrm4_policy.c1
-rw-r--r--net/ipv6/addrconf.c154
-rw-r--r--net/ipv6/af_inet6.c55
-rw-r--r--net/ipv6/datagram.c2
-rw-r--r--net/ipv6/esp6.c2
-rw-r--r--net/ipv6/exthdrs.c22
-rw-r--r--net/ipv6/icmp.c39
-rw-r--r--net/ipv6/ip6_flowlabel.c16
-rw-r--r--net/ipv6/ip6_output.c3
-rw-r--r--net/ipv6/ipv6_sockglue.c6
-rw-r--r--net/ipv6/mcast.c147
-rw-r--r--net/ipv6/netfilter/Kconfig52
-rw-r--r--net/ipv6/netfilter/ip6_tables.c2
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c8
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c12
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c50
-rw-r--r--net/ipv6/raw.c46
-rw-r--r--net/ipv6/reassembly.c41
-rw-r--r--net/ipv6/route.c21
-rw-r--r--net/ipv6/tcp_ipv6.c39
-rw-r--r--net/ipv6/udp.c29
-rw-r--r--net/ipv6/xfrm6_policy.c1
-rw-r--r--net/llc/af_llc.c5
-rw-r--r--net/llc/llc_c_ac.c20
-rw-r--r--net/netfilter/Kconfig4
-rw-r--r--net/netfilter/nf_conntrack_core.c12
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c36
-rw-r--r--net/netfilter/nf_conntrack_standalone.c2
-rw-r--r--net/netfilter/nfnetlink.c33
-rw-r--r--net/netfilter/nfnetlink_log.c6
-rw-r--r--net/netfilter/nfnetlink_queue.c9
-rw-r--r--net/netlink/af_netlink.c2
-rw-r--r--net/netrom/nr_in.c6
-rw-r--r--net/packet/af_packet.c115
-rw-r--r--net/rxrpc/transport.c15
-rw-r--r--net/sched/Kconfig37
-rw-r--r--net/sched/act_api.c2
-rw-r--r--net/sched/sch_netem.c2
-rw-r--r--net/sctp/associola.c33
-rw-r--r--net/sctp/endpointola.c26
-rw-r--r--net/sctp/input.c20
-rw-r--r--net/sctp/protocol.c6
-rw-r--r--net/sctp/sm_sideeffect.c6
-rw-r--r--net/sctp/sm_statefuns.c22
-rw-r--r--net/sctp/socket.c31
-rw-r--r--net/sctp/sysctl.c8
-rw-r--r--net/sctp/transport.c3
-rw-r--r--net/sctp/ulpevent.c24
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c6
-rw-r--r--net/sunrpc/rpc_pipe.c30
-rw-r--r--net/sunrpc/socklib.c5
-rw-r--r--net/sunrpc/svcsock.c10
-rw-r--r--net/sunrpc/xprtsock.c2
-rw-r--r--net/xfrm/xfrm_policy.c49
-rw-r--r--net/xfrm/xfrm_state.c5
122 files changed, 1728 insertions, 1156 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 91e412b0ab0..67465b65abe 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -753,6 +753,8 @@ static int vlan_ioctl_handler(void __user *arg)
break;
case GET_VLAN_REALDEV_NAME_CMD:
err = vlan_dev_get_realdev_name(args.device1, args.u.device2);
+ if (err)
+ goto out;
if (copy_to_user(arg, &args,
sizeof(struct vlan_ioctl_args))) {
err = -EFAULT;
@@ -761,6 +763,8 @@ static int vlan_ioctl_handler(void __user *arg)
case GET_VLAN_VID_CMD:
err = vlan_dev_get_vid(args.device1, &vid);
+ if (err)
+ goto out;
args.u.VID = vid;
if (copy_to_user(arg, &args,
sizeof(struct vlan_ioctl_args))) {
@@ -774,7 +778,7 @@ static int vlan_ioctl_handler(void __user *arg)
__FUNCTION__, args.cmd);
return -EINVAL;
};
-
+out:
return err;
}
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index b7486488967..f2a8750bbf1 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -165,6 +165,9 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
skb_pull(skb, VLAN_HLEN); /* take off the VLAN header (4 bytes currently) */
+ /* Need to correct hardware checksum */
+ skb_postpull_rcsum(skb, vhdr, VLAN_HLEN);
+
/* Ok, lets check to make sure the device (dev) we
* came in on is what this VLAN is attached to.
*/
diff --git a/net/Makefile b/net/Makefile
index 4aa2f46d2a5..f5141b9d4f3 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -15,8 +15,8 @@ obj-$(CONFIG_NET) += $(tmp-y)
# LLC has to be linked before the files in net/802/
obj-$(CONFIG_LLC) += llc/
obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/
-obj-$(CONFIG_INET) += ipv4/
obj-$(CONFIG_NETFILTER) += netfilter/
+obj-$(CONFIG_INET) += ipv4/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX) += unix/
ifneq ($(CONFIG_IPV6),)
diff --git a/net/atm/atm_misc.c b/net/atm/atm_misc.c
index 223c7ad5bd0..02cc7e71efe 100644
--- a/net/atm/atm_misc.c
+++ b/net/atm/atm_misc.c
@@ -74,11 +74,14 @@ struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc,int pdu_size,
*/
-int atm_pcr_goal(struct atm_trafprm *tp)
+int atm_pcr_goal(const struct atm_trafprm *tp)
{
- if (tp->pcr && tp->pcr != ATM_MAX_PCR) return -tp->pcr;
- if (tp->min_pcr && !tp->pcr) return tp->min_pcr;
- if (tp->max_pcr != ATM_MAX_PCR) return -tp->max_pcr;
+ if (tp->pcr && tp->pcr != ATM_MAX_PCR)
+ return -tp->pcr;
+ if (tp->min_pcr && !tp->pcr)
+ return tp->min_pcr;
+ if (tp->max_pcr != ATM_MAX_PCR)
+ return -tp->max_pcr;
return 0;
}
diff --git a/net/atm/common.c b/net/atm/common.c
index 63feea49fb1..6656b111cc0 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -221,6 +221,29 @@ void vcc_release_async(struct atm_vcc *vcc, int reply)
EXPORT_SYMBOL(vcc_release_async);
+void atm_dev_release_vccs(struct atm_dev *dev)
+{
+ int i;
+
+ write_lock_irq(&vcc_sklist_lock);
+ for (i = 0; i < VCC_HTABLE_SIZE; i++) {
+ struct hlist_head *head = &vcc_hash[i];
+ struct hlist_node *node, *tmp;
+ struct sock *s;
+ struct atm_vcc *vcc;
+
+ sk_for_each_safe(s, node, tmp, head) {
+ vcc = atm_sk(s);
+ if (vcc->dev == dev) {
+ vcc_release_async(vcc, -EPIPE);
+ sk_del_node_init(s);
+ }
+ }
+ }
+ write_unlock_irq(&vcc_sklist_lock);
+}
+
+
static int adjust_tp(struct atm_trafprm *tp,unsigned char aal)
{
int max_sdu;
@@ -332,12 +355,13 @@ static int __vcc_connect(struct atm_vcc *vcc, struct atm_dev *dev, short vpi,
return -EINVAL;
if (vci > 0 && vci < ATM_NOT_RSV_VCI && !capable(CAP_NET_BIND_SERVICE))
return -EPERM;
- error = 0;
+ error = -ENODEV;
if (!try_module_get(dev->ops->owner))
- return -ENODEV;
+ return error;
vcc->dev = dev;
write_lock_irq(&vcc_sklist_lock);
- if ((error = find_ci(vcc, &vpi, &vci))) {
+ if (test_bit(ATM_DF_REMOVED, &dev->flags) ||
+ (error = find_ci(vcc, &vpi, &vci))) {
write_unlock_irq(&vcc_sklist_lock);
goto fail_module_put;
}
@@ -423,33 +447,23 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci)
if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS ||
vcc->qos.rxtp.traffic_class == ATM_ANYCLASS)
return -EINVAL;
- if (itf != ATM_ITF_ANY) {
- dev = atm_dev_lookup(itf);
- if (!dev)
- return -ENODEV;
- error = __vcc_connect(vcc, dev, vpi, vci);
- if (error) {
- atm_dev_put(dev);
- return error;
- }
+ if (likely(itf != ATM_ITF_ANY)) {
+ dev = try_then_request_module(atm_dev_lookup(itf), "atm-device-%d", itf);
} else {
- struct list_head *p, *next;
-
dev = NULL;
- spin_lock(&atm_dev_lock);
- list_for_each_safe(p, next, &atm_devs) {
- dev = list_entry(p, struct atm_dev, dev_list);
+ down(&atm_dev_mutex);
+ if (!list_empty(&atm_devs)) {
+ dev = list_entry(atm_devs.next, struct atm_dev, dev_list);
atm_dev_hold(dev);
- spin_unlock(&atm_dev_lock);
- if (!__vcc_connect(vcc, dev, vpi, vci))
- break;
- atm_dev_put(dev);
- dev = NULL;
- spin_lock(&atm_dev_lock);
}
- spin_unlock(&atm_dev_lock);
- if (!dev)
- return -ENODEV;
+ up(&atm_dev_mutex);
+ }
+ if (!dev)
+ return -ENODEV;
+ error = __vcc_connect(vcc, dev, vpi, vci);
+ if (error) {
+ atm_dev_put(dev);
+ return error;
}
if (vpi == ATM_VPI_UNSPEC || vci == ATM_VCI_UNSPEC)
set_bit(ATM_VF_PARTIAL,&vcc->flags);
diff --git a/net/atm/common.h b/net/atm/common.h
index e49ed41c0e3..4887c317cef 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -47,4 +47,6 @@ static inline void atm_proc_exit(void)
/* SVC */
int svc_change_qos(struct atm_vcc *vcc,struct atm_qos *qos);
+void atm_dev_release_vccs(struct atm_dev *dev);
+
#endif
diff --git a/net/atm/resources.c b/net/atm/resources.c
index 415d2615d47..c8c459fcb03 100644
--- a/net/atm/resources.c
+++ b/net/atm/resources.c
@@ -25,7 +25,7 @@
LIST_HEAD(atm_devs);
-DEFINE_SPINLOCK(atm_dev_lock);
+DECLARE_MUTEX(atm_dev_mutex);
static struct atm_dev *__alloc_atm_dev(const char *type)
{
@@ -52,7 +52,7 @@ static struct atm_dev *__atm_dev_lookup(int number)
list_for_each(p, &atm_devs) {
dev = list_entry(p, struct atm_dev, dev_list);
- if ((dev->ops) && (dev->number == number)) {
+ if (dev->number == number) {
atm_dev_hold(dev);
return dev;
}
@@ -64,12 +64,13 @@ struct atm_dev *atm_dev_lookup(int number)
{
struct atm_dev *dev;
- spin_lock(&atm_dev_lock);
+ down(&atm_dev_mutex);
dev = __atm_dev_lookup(number);
- spin_unlock(&atm_dev_lock);
+ up(&atm_dev_mutex);
return dev;
}
+
struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops,
int number, unsigned long *flags)
{
@@ -81,11 +82,11 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops,
type);
return NULL;
}
- spin_lock(&atm_dev_lock);
+ down(&atm_dev_mutex);
if (number != -1) {
if ((inuse = __atm_dev_lookup(number))) {
atm_dev_put(inuse);
- spin_unlock(&atm_dev_lock);
+ up(&atm_dev_mutex);
kfree(dev);
return NULL;
}
@@ -105,19 +106,17 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops,
memset(&dev->flags, 0, sizeof(dev->flags));
memset(&dev->stats, 0, sizeof(dev->stats));
atomic_set(&dev->refcnt, 1);
- list_add_tail(&dev->dev_list, &atm_devs);
- spin_unlock(&atm_dev_lock);
if (atm_proc_dev_register(dev) < 0) {
printk(KERN_ERR "atm_dev_register: "
"atm_proc_dev_register failed for dev %s\n",
type);
- spin_lock(&atm_dev_lock);
- list_del(&dev->dev_list);
- spin_unlock(&atm_dev_lock);
+ up(&atm_dev_mutex);
kfree(dev);
return NULL;
}
+ list_add_tail(&dev->dev_list, &atm_devs);
+ up(&atm_dev_mutex);
return dev;
}
@@ -125,37 +124,22 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops,
void atm_dev_deregister(struct atm_dev *dev)
{
- unsigned long warning_time;
+ BUG_ON(test_bit(ATM_DF_REMOVED, &dev->flags));
+ set_bit(ATM_DF_REMOVED, &dev->flags);
+
+ /*
+ * if we remove current device from atm_devs list, new device
+ * with same number can appear, such we need deregister proc,
+ * release async all vccs and remove them from vccs list too
+ */
+ down(&atm_dev_mutex);
+ list_del(&dev->dev_list);
+ up(&atm_dev_mutex);
+ atm_dev_release_vccs(dev);
atm_proc_dev_deregister(dev);
- spin_lock(&atm_dev_lock);
- list_del(&dev->dev_list);
- spin_unlock(&atm_dev_lock);
-
- warning_time = jiffies;
- while (atomic_read(&dev->refcnt) != 1) {
- msleep(250);
- if ((jiffies - warning_time) > 10 * HZ) {
- printk(KERN_EMERG "atm_dev_deregister: waiting for "
- "dev %d to become free. Usage count = %d\n",
- dev->number, atomic_read(&dev->refcnt));
- warning_time = jiffies;
- }
- }
-
- kfree(dev);
-}
-
-void shutdown_atm_dev(struct atm_dev *dev)
-{
- if (atomic_read(&dev->refcnt) > 1) {
- set_bit(ATM_DF_CLOSE, &dev->flags);
- return;
- }
- if (dev->ops->dev_close)
- dev->ops->dev_close(dev);
- atm_dev_deregister(dev);
+ atm_dev_put(dev);
}
@@ -211,16 +195,16 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
return -EFAULT;
if (get_user(len, &iobuf->length))
return -EFAULT;
- spin_lock(&atm_dev_lock);
+ down(&atm_dev_mutex);
list_for_each(p, &atm_devs)
size += sizeof(int);
if (size > len) {
- spin_unlock(&atm_dev_lock);
+ up(&atm_dev_mutex);
return -E2BIG;
}
tmp_buf = kmalloc(size, GFP_ATOMIC);
if (!tmp_buf) {
- spin_unlock(&atm_dev_lock);
+ up(&atm_dev_mutex);
return -ENOMEM;
}
tmp_p = tmp_buf;
@@ -228,7 +212,7 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
dev = list_entry(p, struct atm_dev, dev_list);
*tmp_p++ = dev->number;
}
- spin_unlock(&atm_dev_lock);
+ up(&atm_dev_mutex);
error = ((copy_to_user(buf, tmp_buf, size)) ||
put_user(size, &iobuf->length))
? -EFAULT : 0;
@@ -245,7 +229,8 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
if (get_user(number, &sioc->number))
return -EFAULT;
- if (!(dev = atm_dev_lookup(number)))
+ if (!(dev = try_then_request_module(atm_dev_lookup(number),
+ "atm-device-%d", number)))
return -ENODEV;
switch (cmd) {
@@ -414,13 +399,13 @@ static __inline__ void *dev_get_idx(loff_t left)
void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos)
{
- spin_lock(&atm_dev_lock);
+ down(&atm_dev_mutex);
return *pos ? dev_get_idx(*pos) : (void *) 1;
}
void atm_dev_seq_stop(struct seq_file *seq, void *v)
{
- spin_unlock(&atm_dev_lock);
+ up(&atm_dev_mutex);
}
void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -434,4 +419,3 @@ void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
EXPORT_SYMBOL(atm_dev_register);
EXPORT_SYMBOL(atm_dev_deregister);
EXPORT_SYMBOL(atm_dev_lookup);
-EXPORT_SYMBOL(shutdown_atm_dev);
diff --git a/net/atm/resources.h b/net/atm/resources.h
index 12910619dbb..b7fb82a93b4 100644
--- a/net/atm/resources.h
+++ b/net/atm/resources.h
@@ -11,8 +11,7 @@
extern struct list_head atm_devs;
-extern spinlock_t atm_dev_lock;
-
+extern struct semaphore atm_dev_mutex;
int atm_dev_ioctl(unsigned int cmd, void __user *arg);
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index defcf6a8607..975abe254b7 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -366,6 +366,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
spin_lock_bh(&br->lock);
br_stp_recalculate_bridge_id(br);
+ br_features_recompute(br);
if ((br->dev->flags & IFF_UP)
&& (dev->flags & IFF_UP) && netif_carrier_ok(dev))
br_stp_enable_port(p);
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index d8e36b77512..23422bd53a5 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -295,7 +295,7 @@ static int check_hbh_len(struct sk_buff *skb)
len -= 2;
while (len > 0) {
- int optlen = raw[off+1]+2;
+ int optlen = skb->nh.raw[off+1]+2;
switch (skb->nh.raw[off]) {
case IPV6_TLV_PAD0:
@@ -308,18 +308,15 @@ static int check_hbh_len(struct sk_buff *skb)
case IPV6_TLV_JUMBO:
if (skb->nh.raw[off+1] != 4 || (off&3) != 2)
goto bad;
-
pkt_len = ntohl(*(u32*)(skb->nh.raw+off+2));
-
+ if (pkt_len <= IPV6_MAXPLEN ||
+ skb->nh.ipv6h->payload_len)
+ goto bad;
if (pkt_len > skb->len - sizeof(struct ipv6hdr))
goto bad;
- if (pkt_len + sizeof(struct ipv6hdr) < skb->len) {
- if (__pskb_trim(skb,
- pkt_len + sizeof(struct ipv6hdr)))
- goto bad;
- if (skb->ip_summed == CHECKSUM_HW)
- skb->ip_summed = CHECKSUM_NONE;
- }
+ if (pskb_trim_rcsum(skb,
+ pkt_len+sizeof(struct ipv6hdr)))
+ goto bad;
break;
default:
if (optlen > len)
@@ -372,6 +369,7 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
goto inhdr_error;
+ nf_bridge_put(skb->nf_bridge);
if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
return NF_DROP;
setup_pre_routing(skb);
@@ -455,6 +453,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
skb->ip_summed = CHECKSUM_NONE;
}
+ nf_bridge_put(skb->nf_bridge);
if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
return NF_DROP;
setup_pre_routing(skb);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index d219435d086..1bcfef51ac5 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -350,6 +350,20 @@ fault:
return -EFAULT;
}
+unsigned int __skb_checksum_complete(struct sk_buff *skb)
+{
+ unsigned int sum;
+
+ sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_HW))
+ netdev_rx_csum_fault(skb->dev);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
+
/**
* skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
* @skb: skbuff
@@ -363,7 +377,7 @@ fault:
* -EFAULT - fault during copy. Beware, in this case iovec
* can be modified!
*/
-int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
+int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
int hlen, struct iovec *iov)
{
unsigned int csum;
@@ -376,8 +390,7 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
iov++;
if (iov->iov_len < chunk) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen,
- skb->csum)))
+ if (__skb_checksum_complete(skb))
goto csum_error;
if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
goto fault;
@@ -388,6 +401,8 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
goto fault;
if ((unsigned short)csum_fold(csum))
goto csum_error;
+ if (unlikely(skb->ip_summed == CHECKSUM_HW))
+ netdev_rx_csum_fault(skb->dev);
iov->iov_len -= chunk;
iov->iov_base += chunk;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 8d154159527..a5efc9ae010 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1108,6 +1108,19 @@ out:
return ret;
}
+/* Take action when hardware reception checksum errors are detected. */
+#ifdef CONFIG_BUG
+void netdev_rx_csum_fault(struct net_device *dev)
+{
+ if (net_ratelimit()) {
+ printk(KERN_ERR "%s: hw csum failure.\n",
+ dev ? dev->name : "<unknown>");
+ dump_stack();
+ }
+}
+EXPORT_SYMBOL(netdev_rx_csum_fault);
+#endif
+
#ifdef CONFIG_HIGHMEM
/* Actually, we should eliminate this check as soon as we know, that:
* 1. IOMMU is present and allows to map all the memory.
diff --git a/net/core/filter.c b/net/core/filter.c
index 079c2edff78..3a10e0bc90e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -116,8 +116,6 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
A /= X;
continue;
case BPF_ALU|BPF_DIV|BPF_K:
- if (fentry->k == 0)
- return 0;
A /= fentry->k;
continue;
case BPF_ALU|BPF_AND|BPF_X:
@@ -295,7 +293,7 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
struct sock_filter *ftest;
int pc;
- if (((unsigned int)flen >= (~0U / sizeof(struct sock_filter))) || flen == 0)
+ if (flen == 0 || flen > BPF_MAXINSNS)
return -EINVAL;
/* check the filter code now */
@@ -320,6 +318,10 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
}
}
+ /* check for division by zero -Kris Katterjohn 2005-10-30 */
+ if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0)
+ return -EINVAL;
+
/* check that memory operations use valid addresses. */
if (ftest->k >= BPF_MEMWORDS) {
/* but it might not be a memory operation... */
@@ -358,7 +360,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
int err;
/* Make sure new filter is there and in the right amounts. */
- if (fprog->filter == NULL || fprog->len > BPF_MAXINSNS)
+ if (fprog->filter == NULL)
return -EINVAL;
fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 802fe11efad..49424a42a2c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -101,16 +101,20 @@ void netpoll_queue(struct sk_buff *skb)
static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
unsigned short ulen, u32 saddr, u32 daddr)
{
- if (uh->check == 0)
+ unsigned int psum;
+
+ if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY)
return 0;
- if (skb->ip_summed == CHECKSUM_HW)
- return csum_tcpudp_magic(
- saddr, daddr, ulen, IPPROTO_UDP, skb->csum);
+ psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+
+ if (skb->ip_summed == CHECKSUM_HW &&
+ !(u16)csum_fold(csum_add(psum, skb->csum)))
+ return 0;
- skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+ skb->csum = psum;
- return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+ return __skb_checksum_complete(skb);
}
/*
@@ -489,7 +493,7 @@ int __netpoll_rx(struct sk_buff *skb)
if (ulen != len)
goto out;
- if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0)
+ if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
goto out;
if (np->local_ip && np->local_ip != ntohl(iph->daddr))
goto out;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b7d13a4fff4..83fee37de38 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1725,7 +1725,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
* of the skb if any page alloc fails user this procedure returns -ENOMEM
*/
int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
- int getfrag(void *from, char *to, int offset,
+ int (*getfrag)(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length)
{
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index ca03521112c..656e13e38cf 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1251,7 +1251,7 @@ static int dccp_v4_destroy_sock(struct sock *sk)
struct dccp_sock *dp = dccp_sk(sk);
/*
- * DCCP doesn't use sk_qrite_queue, just sk_send_head
+ * DCCP doesn't use sk_write_queue, just sk_send_head
* for retransmissions
*/
if (sk->sk_send_head != NULL) {
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index e0ace7cbb99..8a6b2a9e458 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -46,6 +46,7 @@ atomic_t dccp_orphan_count = ATOMIC_INIT(0);
static struct net_protocol dccp_protocol = {
.handler = dccp_v4_rcv,
.err_handler = dccp_v4_err,
+ .no_policy = 1,
};
const char *dccp_packet_name(const int type)
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 3f25cadccdd..d402e9020c6 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -153,6 +153,7 @@ static struct proto_ops dn_proto_ops;
static DEFINE_RWLOCK(dn_hash_lock);
static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
static struct hlist_head dn_wild_sk;
+static atomic_t decnet_memory_allocated;
static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen, int flags);
static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags);
@@ -446,10 +447,26 @@ static void dn_destruct(struct sock *sk)
dst_release(xchg(&sk->sk_dst_cache, NULL));
}
+static int dn_memory_pressure;
+
+static void dn_enter_memory_pressure(void)
+{
+ if (!dn_memory_pressure) {
+ dn_memory_pressure = 1;
+ }
+}
+
static struct proto dn_proto = {
- .name = "DECNET",
- .owner = THIS_MODULE,
- .obj_size = sizeof(struct dn_sock),
+ .name = "NSP",
+ .owner = THIS_MODULE,
+ .enter_memory_pressure = dn_enter_memory_pressure,
+ .memory_pressure = &dn_memory_pressure,
+ .memory_allocated = &decnet_memory_allocated,
+ .sysctl_mem = sysctl_decnet_mem,
+ .sysctl_wmem = sysctl_decnet_wmem,
+ .sysctl_rmem = sysctl_decnet_rmem,
+ .max_header = DN_MAX_NSP_DATA_HEADER + 64,
+ .obj_size = sizeof(struct dn_sock),
};
static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp)
@@ -470,6 +487,8 @@ static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp)
sk->sk_family = PF_DECnet;
sk->sk_protocol = 0;
sk->sk_allocation = gfp;
+ sk->sk_sndbuf = sysctl_decnet_wmem[1];
+ sk->sk_rcvbuf = sysctl_decnet_rmem[1];
/* Initialization of DECnet Session Control Port */
scp = DN_SK(sk);
@@ -1664,17 +1683,15 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
goto out;
}
- rv = dn_check_state(sk, NULL, 0, &timeo, flags);
- if (rv)
- goto out;
-
if (sk->sk_shutdown & RCV_SHUTDOWN) {
- if (!(flags & MSG_NOSIGNAL))
- send_sig(SIGPIPE, current, 0);
- rv = -EPIPE;
+ rv = 0;
goto out;
}
+ rv = dn_check_state(sk, NULL, 0, &timeo, flags);
+ if (rv)
+ goto out;
+
if (flags & ~(MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) {
rv = -EOPNOTSUPP;
goto out;
@@ -1928,6 +1945,8 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
if (sk->sk_shutdown & SEND_SHUTDOWN) {
err = -EPIPE;
+ if (!(flags & MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 0);
goto out_err;
}
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 02bca49cb50..0e9d2c57116 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -10,6 +10,7 @@
*
* Changes:
* Steve Whitehouse - C99 changes and default device handling
+ * Steve Whitehouse - Memory buffer settings, like the tcp ones
*
*/
#include <linux/config.h>
@@ -37,6 +38,11 @@ int decnet_dr_count = 3;
int decnet_log_martians = 1;
int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW;
+/* Reasonable defaults, I hope, based on tcp's defaults */
+int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 };
+int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
+int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
+
#ifdef CONFIG_SYSCTL
extern int decnet_dst_gc_interval;
static int min_decnet_time_wait[] = { 5 };
@@ -428,6 +434,33 @@ static ctl_table dn_table[] = {
.extra1 = &min_decnet_no_fc_max_cwnd,
.extra2 = &max_decnet_no_fc_max_cwnd
},
+ {
+ .ctl_name = NET_DECNET_MEM,
+ .procname = "decnet_mem",
+ .data = &sysctl_decnet_mem,
+ .maxlen = sizeof(sysctl_decnet_mem),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = NET_DECNET_RMEM,
+ .procname = "decnet_rmem",
+ .data = &sysctl_decnet_rmem,
+ .maxlen = sizeof(sysctl_decnet_rmem),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = NET_DECNET_WMEM,
+ .procname = "decnet_wmem",
+ .data = &sysctl_decnet_wmem,
+ .maxlen = sizeof(sysctl_decnet_wmem),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
{
.ctl_name = NET_DECNET_DEBUG_LEVEL,
.procname = "debug",
diff --git a/net/ieee80211/Kconfig b/net/ieee80211/Kconfig
index 91b16fbf91f..d18ccba3ea9 100644
--- a/net/ieee80211/Kconfig
+++ b/net/ieee80211/Kconfig
@@ -55,7 +55,7 @@ config IEEE80211_CRYPT_CCMP
config IEEE80211_CRYPT_TKIP
tristate "IEEE 802.11i TKIP encryption"
- depends on IEEE80211
+ depends on IEEE80211 && NET_RADIO
select CRYPTO
select CRYPTO_MICHAEL_MIC
---help---
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eaa150c33b0..d368cf24900 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -228,13 +228,14 @@ static int inet_create(struct socket *sock, int protocol)
unsigned char answer_flags;
char answer_no_check;
int try_loading_module = 0;
- int err = -ESOCKTNOSUPPORT;
+ int err;
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
answer = NULL;
lookup_protocol:
+ err = -ESOCKTNOSUPPORT;
rcu_read_lock();
list_for_each_rcu(p, &inetsw[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
@@ -252,6 +253,7 @@ lookup_protocol:
if (IPPROTO_IP == answer->protocol)
break;
}
+ err = -EPROTONOSUPPORT;
answer = NULL;
}
@@ -280,9 +282,6 @@ lookup_protocol:
err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
- err = -EPROTONOSUPPORT;
- if (!protocol)
- goto out_rcu_unlock;
sock->ops = answer->ops;
answer_prot = answer->prot;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 4ec4b2ca6ab..04a6fe3e95a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -234,7 +234,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
int destroy)
{
struct in_ifaddr *promote = NULL;
- struct in_ifaddr *ifa1 = *ifap;
+ struct in_ifaddr *ifa, *ifa1 = *ifap;
+ struct in_ifaddr *last_prim = in_dev->ifa_list;
+ struct in_ifaddr *prev_prom = NULL;
+ int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
ASSERT_RTNL();
@@ -243,18 +246,22 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
**/
if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
- struct in_ifaddr *ifa;
struct in_ifaddr **ifap1 = &ifa1->ifa_next;
while ((ifa = *ifap1) != NULL) {
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
+ ifa1->ifa_scope <= ifa->ifa_scope)
+ last_prim = ifa;
+
if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
ifa1->ifa_mask != ifa->ifa_mask ||
!inet_ifa_match(ifa1->ifa_address, ifa)) {
ifap1 = &ifa->ifa_next;
+ prev_prom = ifa;
continue;
}
- if (!IN_DEV_PROMOTE_SECONDARIES(in_dev)) {
+ if (!do_promote) {
*ifap1 = ifa->ifa_next;
rtmsg_ifa(RTM_DELADDR, ifa);
@@ -283,18 +290,31 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
*/
rtmsg_ifa(RTM_DELADDR, ifa1);
notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
- if (destroy) {
- inet_free_ifa(ifa1);
- if (!in_dev->ifa_list)
- inetdev_destroy(in_dev);
- }
+ if (promote) {
+
+ if (prev_prom) {
+ prev_prom->ifa_next = promote->ifa_next;
+ promote->ifa_next = last_prim->ifa_next;
+ last_prim->ifa_next = promote;
+ }
- if (promote && IN_DEV_PROMOTE_SECONDARIES(in_dev)) {
- /* not sure if we should send a delete notify first? */
promote->ifa_flags &= ~IFA_F_SECONDARY;
rtmsg_ifa(RTM_NEWADDR, promote);
notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote);
+ for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) {
+ if (ifa1->ifa_mask != ifa->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, ifa))
+ continue;
+ fib_add_ifaddr(ifa);
+ }
+
+ }
+ if (destroy) {
+ inet_free_ifa(ifa1);
+
+ if (!in_dev->ifa_list)
+ inetdev_destroy(in_dev);
}
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 2267c1fad87..19b1b984d68 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -407,7 +407,7 @@ static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr
tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
}
-static void fib_add_ifaddr(struct in_ifaddr *ifa)
+void fib_add_ifaddr(struct in_ifaddr *ifa)
{
struct in_device *in_dev = ifa->ifa_dev;
struct net_device *dev = in_dev->dev;
@@ -544,12 +544,16 @@ static void nl_fib_input(struct sock *sk, int len)
struct sk_buff *skb = NULL;
struct nlmsghdr *nlh = NULL;
struct fib_result_nl *frn;
- int err;
u32 pid;
struct fib_table *tb;
- skb = skb_recv_datagram(sk, 0, 0, &err);
+ skb = skb_dequeue(&sk->sk_receive_queue);
nlh = (struct nlmsghdr *)skb->data;
+ if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
+ nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) {
+ kfree_skb(skb);
+ return;
+ }
frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
tb = fib_get_table(frn->tb_id_in);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 2a8c9afc369..7ea0209cb16 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -975,7 +975,7 @@ static void fib_seq_stop(struct seq_file *seq, void *v)
static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi)
{
- static unsigned type2flags[RTN_MAX + 1] = {
+ static const unsigned type2flags[RTN_MAX + 1] = {
[7] = RTF_REJECT, [8] = RTF_REJECT,
};
unsigned flags = type2flags[type];
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 186f20c4a45..6d2a6ac070e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -83,7 +83,7 @@ for (nhsel=0; nhsel < 1; nhsel++)
#define endfor_nexthops(fi) }
-static struct
+static const struct
{
int error;
u8 scope;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 66247f38b37..705e3ce86df 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2378,6 +2378,7 @@ static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
*/
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
+ const struct fib_trie_iter *iter = seq->private;
struct leaf *l = v;
int i;
char bf[128];
@@ -2389,6 +2390,8 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
return 0;
}
+ if (iter->trie == trie_local)
+ return 0;
if (IS_TNODE(l))
return 0;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 175e093ec56..92e23b2ad4d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -220,7 +220,7 @@ struct icmp_control {
short error; /* This ICMP is classed as an error message */
};
-static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
+static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
/*
* The ICMP socket(s). This is the most convenient way to flow control
@@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb)
case CHECKSUM_HW:
if (!(u16)csum_fold(skb->csum))
break;
- LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
+ /* fall through */
case CHECKSUM_NONE:
- if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
+ skb->csum = 0;
+ if (__skb_checksum_complete(skb))
goto error;
- default:;
}
if (!pskb_pull(skb, sizeof(struct icmphdr)))
@@ -994,7 +994,7 @@ error:
/*
* This table is the definition of how we handle ICMP.
*/
-static struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
+static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
[ICMP_ECHOREPLY] = {
.output_entry = ICMP_MIB_OUTECHOREPS,
.input_entry = ICMP_MIB_INECHOREPS,
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c6247fc8406..4a195c724f0 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb)
return 0;
}
- if (!pskb_may_pull(skb, sizeof(struct igmphdr)) ||
- (u16)csum_fold(skb_checksum(skb, 0, len, 0))) {
- in_dev_put(in_dev);
- kfree_skb(skb);
- return 0;
+ if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
+ goto drop;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
+ if (!(u16)csum_fold(skb->csum))
+ break;
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = 0;
+ if (__skb_checksum_complete(skb))
+ goto drop;
}
ih = skb->h.igmph;
@@ -890,7 +897,10 @@ int igmp_rcv(struct sk_buff *skb)
/* Is it our report looped back? */
if (((struct rtable*)skb->dst)->fl.iif == 0)
break;
- igmp_heard_report(in_dev, ih->group);
+ /* don't rely on MC router hearing unicast reports */
+ if (skb->pkt_type == PACKET_MULTICAST ||
+ skb->pkt_type == PACKET_BROADCAST)
+ igmp_heard_report(in_dev, ih->group);
break;
case IGMP_PIM:
#ifdef CONFIG_IP_PIMSM_V1
@@ -906,6 +916,8 @@ int igmp_rcv(struct sk_buff *skb)
default:
NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
}
+
+drop:
in_dev_put(in_dev);
kfree_skb(skb);
return 0;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e7d26d9943c..8ce0ce2ee48 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -71,7 +71,7 @@ struct ipfrag_skb_cb
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
- struct ipq *next; /* linked list pointers */
+ struct hlist_node list;
struct list_head lru_list; /* lru list member */
u32 user;
u32 saddr;
@@ -89,7 +89,6 @@ struct ipq {
spinlock_t lock;
atomic_t refcnt;
struct timer_list timer; /* when will this queue expire? */
- struct ipq **pprev;
int iif;
struct timeval stamp;
};
@@ -99,7 +98,7 @@ struct ipq {
#define IPQ_HASHSZ 64
/* Per-bucket lock is easy to add now. */
-static struct ipq *ipq_hash[IPQ_HASHSZ];
+static struct hlist_head ipq_hash[IPQ_HASHSZ];
static DEFINE_RWLOCK(ipfrag_lock);
static u32 ipfrag_hash_rnd;
static LIST_HEAD(ipq_lru_list);
@@ -107,9 +106,7 @@ int ip_frag_nqueues = 0;
static __inline__ void __ipq_unlink(struct ipq *qp)
{
- if(qp->next)
- qp->next->pprev = qp->pprev;
- *qp->pprev = qp->next;
+ hlist_del(&qp->list);
list_del(&qp->lru_list);
ip_frag_nqueues--;
}
@@ -139,27 +136,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy)
get_random_bytes(&ipfrag_hash_rnd, sizeof(u32));
for (i = 0; i < IPQ_HASHSZ; i++) {
struct ipq *q;
+ struct hlist_node *p, *n;
- q = ipq_hash[i];
- while (q) {
- struct ipq *next = q->next;
+ hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) {
unsigned int hval = ipqhashfn(q->id, q->saddr,
q->daddr, q->protocol);
if (hval != i) {
- /* Unlink. */
- if (q->next)
- q->next->pprev = q->pprev;
- *q->pprev = q->next;
+ hlist_del(&q->list);
/* Relink to new hash chain. */
- if ((q->next = ipq_hash[hval]) != NULL)
- q->next->pprev = &q->next;
- ipq_hash[hval] = q;
- q->pprev = &ipq_hash[hval];
+ hlist_add_head(&q->list, &ipq_hash[hval]);
}
-
- q = next;
}
}
write_unlock(&ipfrag_lock);
@@ -310,14 +298,16 @@ out:
static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
{
struct ipq *qp;
-
+#ifdef CONFIG_SMP
+ struct hlist_node *n;
+#endif
write_lock(&ipfrag_lock);
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could be created on other cpu, while we
* promoted read lock to write lock.
*/
- for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
if(qp->id == qp_in->id &&
qp->saddr == qp_in->saddr &&
qp->daddr == qp_in->daddr &&
@@ -337,10 +327,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
atomic_inc(&qp->refcnt);
atomic_inc(&qp->refcnt);
- if((qp->next = ipq_hash[hash]) != NULL)
- qp->next->pprev = &qp->next;
- ipq_hash[hash] = qp;
- qp->pprev = &ipq_hash[hash];
+ hlist_add_head(&qp->list, &ipq_hash[hash]);
INIT_LIST_HEAD(&qp->lru_list);
list_add_tail(&qp->lru_list, &ipq_lru_list);
ip_frag_nqueues++;
@@ -392,9 +379,10 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
__u8 protocol = iph->protocol;
unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
struct ipq *qp;
+ struct hlist_node *n;
read_lock(&ipfrag_lock);
- for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
if(qp->id == id &&
qp->saddr == saddr &&
qp->daddr == daddr &&
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 896ce3f8f53..46f9d9cf7a5 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb)
goto drop_nolock;
if (flags&GRE_CSUM) {
- if (skb->ip_summed == CHECKSUM_HW) {
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
csum = (u16)csum_fold(skb->csum);
- if (csum)
- skb->ip_summed = CHECKSUM_NONE;
- }
- if (skb->ip_summed == CHECKSUM_NONE) {
- skb->csum = skb_checksum(skb, 0, skb->len, 0);
+ if (!csum)
+ break;
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = 0;
+ csum = __skb_checksum_complete(skb);
skb->ip_summed = CHECKSUM_HW;
- csum = (u16)csum_fold(skb->csum);
}
offset += 4;
}
@@ -617,7 +618,7 @@ static int ipgre_rcv(struct sk_buff *skb)
skb->mac.raw = skb->nh.raw;
skb->nh.raw = __pskb_pull(skb, offset);
- skb_postpull_rcsum(skb, skb->mac.raw, offset);
+ skb_postpull_rcsum(skb, skb->h.raw, offset);
memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
@@ -1216,7 +1217,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
return 0;
}
-int __init ipgre_fb_tunnel_init(struct net_device *dev)
+static int __init ipgre_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
struct iphdr *iph = &tunnel->parms.iph;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 11c2f68254f..eba64e2bd39 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -690,7 +690,7 @@ csum_page(struct page *page, int offset, int copy)
return csum;
}
-inline int ip_ufo_append_data(struct sock *sk,
+static inline int ip_ufo_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen,
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index f828fa2eb7d..2a3a8c59c65 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -771,7 +771,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
* The drop rate array needs tuning for real environments.
* Called from timer bh only => no locking
*/
- static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
static char todrop_counter[9] = {0};
int i;
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 2d66848e7aa..9bdcf31b760 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1909,7 +1909,7 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN SVCDEST_ARG_LEN
-static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
+static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
[SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
@@ -2180,7 +2180,7 @@ __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
-static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
+static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
[GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
[GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
[GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index c19408973c0..0e878fd6215 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -251,7 +251,7 @@ tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
#define TCP_DIR_OUTPUT 4
#define TCP_DIR_INPUT_ONLY 8
-static int tcp_state_off[IP_VS_DIR_LAST] = {
+static const int tcp_state_off[IP_VS_DIR_LAST] = {
[IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
[IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
[IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 9d3c8b5f327..88a60650e6b 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -56,8 +56,8 @@ config IP_NF_CONNTRACK_MARK
instead of the individual packets.
config IP_NF_CONNTRACK_EVENTS
- bool "Connection tracking events"
- depends on IP_NF_CONNTRACK
+ bool "Connection tracking events (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && IP_NF_CONNTRACK
help
If this option is enabled, the connection tracking code will
provide a notifier chain that can be used by other kernel code
@@ -66,8 +66,8 @@ config IP_NF_CONNTRACK_EVENTS
IF unsure, say `N'.
config IP_NF_CONNTRACK_NETLINK
- tristate 'Connection tracking netlink interface'
- depends on IP_NF_CONNTRACK && NETFILTER_NETLINK
+ tristate 'Connection tracking netlink interface (EXPERIMENTAL)'
+ depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK
depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m
help
This option enables support for a netlink-based userspace interface
@@ -440,7 +440,7 @@ config IP_NF_MATCH_COMMENT
config IP_NF_MATCH_CONNMARK
tristate 'Connection mark match support'
depends on IP_NF_IPTABLES
- depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
+ depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
help
This option adds a `connmark' match, which allows you to match the
connection mark value previously set for the session by `CONNMARK'.
@@ -452,7 +452,7 @@ config IP_NF_MATCH_CONNMARK
config IP_NF_MATCH_CONNBYTES
tristate 'Connection byte/packet counter match support'
depends on IP_NF_IPTABLES
- depends on IP_NF_CT_ACCT || (NF_CT_ACCT && NF_CONNTRACK_IPV4)
+ depends on (IP_NF_CONNTRACK && IP_NF_CT_ACCT) || (NF_CT_ACCT && NF_CONNTRACK_IPV4)
help
This option adds a `connbytes' match, which allows you to match the
number of bytes and/or packets for each direction within a connection.
@@ -767,7 +767,7 @@ config IP_NF_TARGET_TTL
config IP_NF_TARGET_CONNMARK
tristate 'CONNMARK target support'
depends on IP_NF_MANGLE
- depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
+ depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
help
This option adds a `CONNMARK' target, which allows one to manipulate
the connection mark value. Similar to the MARK target, but
@@ -779,8 +779,8 @@ config IP_NF_TARGET_CONNMARK
config IP_NF_TARGET_CLUSTERIP
tristate "CLUSTERIP target support (EXPERIMENTAL)"
- depends on IP_NF_IPTABLES && EXPERIMENTAL
- depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
+ depends on IP_NF_MANGLE && EXPERIMENTAL
+ depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
help
The CLUSTERIP target allows you to build load-balancing clusters of
network servers without having a dedicated load-balancing
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 058c48e258f..d0a447e520a 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -12,6 +12,7 @@ ip_nat_pptp-objs := ip_nat_helper_pptp.o ip_nat_proto_gre.o
# connection tracking
obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
+obj-$(CONFIG_IP_NF_NAT) += ip_nat.o
# conntrack netlink interface
obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
@@ -41,7 +42,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
# the three instances of ip_tables
obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
-obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o ip_nat.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
# matches
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index fa3f914117e..e52847fa10f 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -37,7 +37,7 @@ MODULE_LICENSE("GPL");
module_param(master_timeout, int, 0600);
MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
-static char *conns[] = { "DATA ", "MESG ", "INDEX " };
+static const char *conns[] = { "DATA ", "MESG ", "INDEX " };
/* This is slow, but it's simple. --RR */
static char *amanda_buffer;
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 422ab68ee7f..84c66dbfeda 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -1345,6 +1345,11 @@ static int kill_all(struct ip_conntrack *i, void *data)
return 1;
}
+void ip_conntrack_flush(void)
+{
+ ip_ct_iterate_cleanup(kill_all, NULL);
+}
+
static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
{
if (vmalloced)
@@ -1354,8 +1359,12 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
get_order(sizeof(struct list_head) * size));
}
-void ip_conntrack_flush()
+/* Mishearing the voices in his head, our hero wonders how he's
+ supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
{
+ ip_ct_attach = NULL;
+
/* This makes sure all current packets have passed through
netfilter framework. Roll on, two-stage module
delete... */
@@ -1363,7 +1372,7 @@ void ip_conntrack_flush()
ip_ct_event_cache_flush();
i_see_dead_people:
- ip_ct_iterate_cleanup(kill_all, NULL);
+ ip_conntrack_flush();
if (atomic_read(&ip_conntrack_count) != 0) {
schedule();
goto i_see_dead_people;
@@ -1371,14 +1380,7 @@ void ip_conntrack_flush()
/* wait until all references to ip_conntrack_untracked are dropped */
while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
schedule();
-}
-/* Mishearing the voices in his head, our hero wonders how he's
- supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
-{
- ip_ct_attach = NULL;
- ip_conntrack_flush();
kmem_cache_destroy(ip_conntrack_cachep);
kmem_cache_destroy(ip_conntrack_expect_cachep);
free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
@@ -1408,7 +1410,7 @@ static struct list_head *alloc_hashtable(int size, int *vmalloced)
return hash;
}
-int set_hashsize(const char *val, struct kernel_param *kp)
+static int set_hashsize(const char *val, struct kernel_param *kp)
{
int i, bucket, hashsize, vmalloced;
int old_vmalloced, old_size;
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index d77d6b3f5f8..68b173bcda6 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -29,9 +29,9 @@ static char *ftp_buffer;
static DEFINE_SPINLOCK(ip_ftp_lock);
#define MAX_PORTS 8
-static short ports[MAX_PORTS];
+static unsigned short ports[MAX_PORTS];
static int ports_c;
-module_param_array(ports, short, &ports_c, 0400);
+module_param_array(ports, ushort, &ports_c, 0400);
static int loose;
module_param(loose, int, 0600);
@@ -55,7 +55,7 @@ static int try_rfc959(const char *, size_t, u_int32_t [], char);
static int try_eprt(const char *, size_t, u_int32_t [], char);
static int try_epsv_response(const char *, size_t, u_int32_t [], char);
-static struct ftp_search {
+static const struct ftp_search {
enum ip_conntrack_dir dir;
const char *pattern;
size_t plen;
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 15457415a4f..d7c40421d0d 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -34,7 +34,7 @@
#include <linux/moduleparam.h>
#define MAX_PORTS 8
-static short ports[MAX_PORTS];
+static unsigned short ports[MAX_PORTS];
static int ports_c;
static int max_dcc_channels = 8;
static unsigned int dcc_timeout = 300;
@@ -52,14 +52,14 @@ EXPORT_SYMBOL_GPL(ip_nat_irc_hook);
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
MODULE_LICENSE("GPL");
-module_param_array(ports, short, &ports_c, 0400);
+module_param_array(ports, ushort, &ports_c, 0400);
MODULE_PARM_DESC(ports, "port numbers of IRC servers");
module_param(max_dcc_channels, int, 0400);
MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session");
module_param(dcc_timeout, int, 0400);
MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels");
-static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " };
+static const char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " };
#define MINMATCHLEN 5
#if 0
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index d2a4fec2286..91fe8f2e38f 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -27,6 +27,7 @@
#include <linux/errno.h>
#include <linux/netlink.h>
#include <linux/spinlock.h>
+#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/netfilter.h>
@@ -59,11 +60,13 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb,
NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
+ /* If no protocol helper is found, this function will return the
+ * generic protocol helper, so proto won't *ever* be NULL */
proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
- if (likely(proto && proto->tuple_to_nfattr)) {
+ if (likely(proto->tuple_to_nfattr))
ret = proto->tuple_to_nfattr(skb, tuple);
- ip_conntrack_proto_put(proto);
- }
+
+ ip_conntrack_proto_put(proto);
return ret;
@@ -128,9 +131,11 @@ ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
struct nfattr *nest_proto;
int ret;
-
- if (!proto || !proto->to_nfattr)
+
+ if (!proto->to_nfattr) {
+ ip_conntrack_proto_put(proto);
return 0;
+ }
nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
@@ -467,7 +472,7 @@ out:
}
#endif
-static const int cta_min_ip[CTA_IP_MAX] = {
+static const size_t cta_min_ip[CTA_IP_MAX] = {
[CTA_IP_V4_SRC-1] = sizeof(u_int32_t),
[CTA_IP_V4_DST-1] = sizeof(u_int32_t),
};
@@ -497,8 +502,8 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
return 0;
}
-static const int cta_min_proto[CTA_PROTO_MAX] = {
- [CTA_PROTO_NUM-1] = sizeof(u_int16_t),
+static const size_t cta_min_proto[CTA_PROTO_MAX] = {
+ [CTA_PROTO_NUM-1] = sizeof(u_int8_t),
[CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
[CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t),
[CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
@@ -523,14 +528,14 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr,
if (!tb[CTA_PROTO_NUM-1])
return -EINVAL;
- tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
+ tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
- if (likely(proto && proto->nfattr_to_tuple)) {
+ if (likely(proto->nfattr_to_tuple))
ret = proto->nfattr_to_tuple(tb, tuple);
- ip_conntrack_proto_put(proto);
- }
+
+ ip_conntrack_proto_put(proto);
return ret;
}
@@ -576,7 +581,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
}
#ifdef CONFIG_IP_NF_NAT_NEEDED
-static const int cta_min_protonat[CTA_PROTONAT_MAX] = {
+static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = {
[CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t),
[CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t),
};
@@ -596,8 +601,6 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr,
return -EINVAL;
npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
- if (!npt)
- return 0;
if (!npt->nfattr_to_range) {
ip_nat_proto_put(npt);
@@ -614,6 +617,11 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr,
return 0;
}
+static const size_t cta_min_nat[CTA_NAT_MAX] = {
+ [CTA_NAT_MINIP-1] = sizeof(u_int32_t),
+ [CTA_NAT_MAXIP-1] = sizeof(u_int32_t),
+};
+
static inline int
ctnetlink_parse_nat(struct nfattr *cda[],
const struct ip_conntrack *ct, struct ip_nat_range *range)
@@ -627,6 +635,9 @@ ctnetlink_parse_nat(struct nfattr *cda[],
nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
+ if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
+ return -EINVAL;
+
if (tb[CTA_NAT_MINIP-1])
range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
@@ -667,6 +678,14 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
return 0;
}
+static const size_t cta_min[CTA_MAX] = {
+ [CTA_STATUS-1] = sizeof(u_int32_t),
+ [CTA_TIMEOUT-1] = sizeof(u_int32_t),
+ [CTA_MARK-1] = sizeof(u_int32_t),
+ [CTA_USE-1] = sizeof(u_int32_t),
+ [CTA_ID-1] = sizeof(u_int32_t)
+};
+
static int
ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
@@ -678,6 +697,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
DEBUGP("entered %s\n", __FUNCTION__);
+ if (nfattr_bad_size(cda, CTA_MAX, cta_min))
+ return -EINVAL;
+
if (cda[CTA_TUPLE_ORIG-1])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
else if (cda[CTA_TUPLE_REPLY-1])
@@ -706,11 +728,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
return -ENOENT;
}
}
- if (del_timer(&ct->timeout)) {
- ip_conntrack_put(ct);
+ if (del_timer(&ct->timeout))
ct->timeout.function((unsigned long)ct);
- return 0;
- }
+
ip_conntrack_put(ct);
DEBUGP("leaving\n");
@@ -760,6 +780,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
return 0;
}
+ if (nfattr_bad_size(cda, CTA_MAX, cta_min))
+ return -EINVAL;
+
if (cda[CTA_TUPLE_ORIG-1])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
else if (cda[CTA_TUPLE_REPLY-1])
@@ -852,7 +875,7 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
DEBUGP("NAT status: %lu\n",
status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
- if (ip_nat_initialized(ct, hooknum))
+ if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
return -EEXIST;
ip_nat_setup_info(ct, &range, hooknum);
@@ -935,8 +958,6 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[])
nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr);
proto = ip_conntrack_proto_find_get(npt);
- if (!proto)
- return -EINVAL;
if (proto->from_nfattr)
err = proto->from_nfattr(tb, ct);
@@ -1047,6 +1068,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
DEBUGP("entered %s\n", __FUNCTION__);
+ if (nfattr_bad_size(cda, CTA_MAX, cta_min))
+ return -EINVAL;
+
if (cda[CTA_TUPLE_ORIG-1]) {
err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
if (err < 0)
@@ -1252,6 +1276,11 @@ out:
return skb->len;
}
+static const size_t cta_min_exp[CTA_EXPECT_MAX] = {
+ [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t),
+ [CTA_EXPECT_ID-1] = sizeof(u_int32_t)
+};
+
static int
ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
@@ -1263,6 +1292,9 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
DEBUGP("entered %s\n", __FUNCTION__);
+ if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
+ return -EINVAL;
+
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct nfgenmsg *msg = NLMSG_DATA(nlh);
u32 rlen;
@@ -1333,6 +1365,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
struct ip_conntrack_helper *h;
int err;
+ if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
+ return -EINVAL;
+
if (cda[CTA_EXPECT_TUPLE-1]) {
/* delete a single expect by tuple */
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
@@ -1462,6 +1497,9 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
DEBUGP("entered %s\n", __FUNCTION__);
+ if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
+ return -EINVAL;
+
if (!cda[CTA_EXPECT_TUPLE-1]
|| !cda[CTA_EXPECT_MASK-1]
|| !cda[CTA_EXPECT_MASTER-1])
@@ -1504,29 +1542,22 @@ static struct notifier_block ctnl_notifier_exp = {
static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
[IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
- .attr_count = CTA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_MAX, },
[IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_MAX, },
[IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
- .attr_count = CTA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_MAX, },
[IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_MAX, },
};
static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
[IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
- .attr_count = CTA_EXPECT_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_EXPECT_MAX, },
[IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
- .attr_count = CTA_EXPECT_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_EXPECT_MAX, },
[IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
- .attr_count = CTA_EXPECT_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_EXPECT_MAX, },
};
static struct nfnetlink_subsystem ctnl_subsys = {
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 5198f3a1e2c..5f9925db608 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -13,6 +13,7 @@
#include <linux/in.h>
#include <linux/icmp.h>
#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/checksum.h>
#include <linux/netfilter.h>
@@ -50,7 +51,7 @@ static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple,
const struct ip_conntrack_tuple *orig)
{
/* Add 1; spaces filled with 0. */
- static u_int8_t invmap[]
+ static const u_int8_t invmap[]
= { [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
[ICMP_ECHOREPLY] = ICMP_ECHO + 1,
[ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
@@ -109,7 +110,7 @@ static int icmp_packet(struct ip_conntrack *ct,
return NF_ACCEPT;
}
-static u_int8_t valid_new[] = {
+static const u_int8_t valid_new[] = {
[ICMP_ECHO] = 1,
[ICMP_TIMESTAMP] = 1,
[ICMP_INFO_REQUEST] = 1,
@@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
case CHECKSUM_HW:
if (!(u16)csum_fold(skb->csum))
break;
- if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_icmp: bad HW ICMP checksum ");
- return -NF_ACCEPT;
+ /* fall through */
case CHECKSUM_NONE:
- if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
+ skb->csum = 0;
+ if (__skb_checksum_complete(skb)) {
if (LOG_INVALID(IPPROTO_ICMP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: bad ICMP checksum ");
return -NF_ACCEPT;
}
- default:
- break;
}
checksum_skipped:
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index 59a4a0111dd..977fb59d456 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -65,7 +65,7 @@ static unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000;
static unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000;
static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS;
-static unsigned long * sctp_timeouts[]
+static const unsigned long * sctp_timeouts[]
= { NULL, /* SCTP_CONNTRACK_NONE */
&ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */
&ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */
@@ -118,7 +118,7 @@ cookie echoed to closed.
*/
/* SCTP conntrack state transitions */
-static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
+static const enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 468c6003b4c..e7fa29e576d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -99,7 +99,7 @@ unsigned long ip_ct_tcp_timeout_close = 10 SECS;
to ~13-30min depending on RTO. */
unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS;
-static unsigned long * tcp_timeouts[]
+static const unsigned long * tcp_timeouts[]
= { NULL, /* TCP_CONNTRACK_NONE */
&ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
&ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
@@ -170,7 +170,7 @@ enum tcp_bit_set {
* if they are invalid
* or we do not support the request (simultaneous open)
*/
-static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
+static const enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
@@ -272,9 +272,9 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sCL -> sCL
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
+/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
/*
- * sSS -> sIV Might be a half-open connection.
+ * sSS -> sIG Might be a half-open connection.
* sSR -> sSR Might answer late resent SYN.
* sES -> sES :-)
* sFW -> sCW Normal close request answered by ACK.
@@ -341,9 +341,10 @@ static int tcp_print_conntrack(struct seq_file *s,
static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
const struct ip_conntrack *ct)
{
- struct nfattr *nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
+ struct nfattr *nest_parms;
read_lock_bh(&tcp_lock);
+ nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
&ct->proto.tcp.state);
read_unlock_bh(&tcp_lock);
@@ -357,6 +358,10 @@ nfattr_failure:
return -1;
}
+static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = {
+ [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t),
+};
+
static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct)
{
struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1];
@@ -369,6 +374,9 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct)
nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr);
+ if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp))
+ return -EINVAL;
+
if (!tb[CTA_PROTOINFO_TCP_STATE-1])
return -EINVAL;
@@ -810,10 +818,11 @@ void ip_conntrack_tcp_update(struct sk_buff *skb,
#define TH_CWR 0x80
/* table of valid flag combinations - ECE and CWR are always valid */
-static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
+static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
{
[TH_SYN] = 1,
[TH_SYN|TH_ACK] = 1,
+ [TH_SYN|TH_PUSH] = 1,
[TH_SYN|TH_ACK|TH_PUSH] = 1,
[TH_RST] = 1,
[TH_RST|TH_ACK] = 1,
@@ -909,8 +918,12 @@ static int tcp_packet(struct ip_conntrack *conntrack,
switch (new_state) {
case TCP_CONNTRACK_IGNORE:
- /* Either SYN in ORIGINAL
- * or SYN/ACK in REPLY. */
+ /* Ignored packets:
+ *
+ * a) SYN in ORIGINAL
+ * b) SYN/ACK in REPLY
+ * c) ACK in reply direction after initial SYN in original.
+ */
if (index == TCP_SYNACK_SET
&& conntrack->proto.tcp.last_index == TCP_SYN_SET
&& conntrack->proto.tcp.last_dir != dir
@@ -977,13 +990,20 @@ static int tcp_packet(struct ip_conntrack *conntrack,
}
case TCP_CONNTRACK_CLOSE:
if (index == TCP_RST_SET
- && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
- && conntrack->proto.tcp.last_index == TCP_SYN_SET
+ && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
+ && conntrack->proto.tcp.last_index == TCP_SYN_SET)
+ || (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
+ && conntrack->proto.tcp.last_index == TCP_ACK_SET))
&& ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) {
- /* RST sent to invalid SYN we had let trough
- * SYN was in window then, tear down connection.
+ /* RST sent to invalid SYN or ACK we had let trough
+ * at a) and c) above:
+ *
+ * a) SYN was in window then
+ * c) we hold a half-open connection.
+ *
+ * Delete our connection entry.
* We skip window checking, because packet might ACK
- * segments we ignored in the SYN. */
+ * segments we ignored. */
goto in_window;
}
/* Just fall trough */
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
index a78736b8525..d3c5a371f99 100644
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -26,9 +26,9 @@ MODULE_DESCRIPTION("tftp connection tracking helper");
MODULE_LICENSE("GPL");
#define MAX_PORTS 8
-static short ports[MAX_PORTS];
+static unsigned short ports[MAX_PORTS];
static int ports_c;
-module_param_array(ports, short, &ports_c, 0400);
+module_param_array(ports, ushort, &ports_c, 0400);
MODULE_PARM_DESC(ports, "port numbers of tftp servers");
#if 0
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 762f4d93936..c1a61462507 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -49,7 +49,7 @@ static unsigned int ip_nat_htable_size;
static struct list_head *bysource;
#define MAX_IP_NAT_PROTO 256
-struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
+static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
static inline struct ip_nat_protocol *
__ip_nat_proto_find(u_int8_t protonum)
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
index 2215317c76b..43c3bd7c118 100644
--- a/net/ipv4/netfilter/ip_nat_tftp.c
+++ b/net/ipv4/netfilter/ip_nat_tftp.c
@@ -42,7 +42,10 @@ static unsigned int help(struct sk_buff **pskb,
enum ip_conntrack_info ctinfo,
struct ip_conntrack_expect *exp)
{
- exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port;
+ struct ip_conntrack *ct = exp->master;
+
+ exp->saved_proto.udp.port
+ = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
exp->dir = IP_CT_DIR_REPLY;
exp->expectfn = ip_nat_follow_master;
if (ip_conntrack_expect_related(exp) != 0)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 75c27e92f6a..45886c8475e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1892,7 +1892,7 @@ static int ipt_get_matches(char *buffer, char **start, off_t offset, int length)
return pos;
}
-static struct { char *name; get_info_t *get_info; } ipt_proc_entry[] =
+static const struct { char *name; get_info_t *get_info; } ipt_proc_entry[] =
{ { "ip_tables_names", ipt_get_tables },
{ "ip_tables_targets", ipt_get_targets },
{ "ip_tables_matches", ipt_get_matches },
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 92ed050fac6..30be0f1dae3 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -197,7 +197,7 @@ static void dump_packet(const struct nf_loginfo *info,
}
case IPPROTO_ICMP: {
struct icmphdr _icmph, *ich;
- static size_t required_len[NR_ICMP_TYPES+1]
+ static const size_t required_len[NR_ICMP_TYPES+1]
= { [ICMP_ECHOREPLY] = 4,
[ICMP_DEST_UNREACH]
= 8 + sizeof(struct iphdr),
@@ -351,7 +351,7 @@ static void dump_packet(const struct nf_loginfo *info,
/* maxlen = 230+ 91 + 230 + 252 = 803 */
}
-struct nf_loginfo default_loginfo = {
+static struct nf_loginfo default_loginfo = {
.type = NF_LOG_TYPE_LOG,
.u = {
.log = {
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 2d44b07688a..261cbb4d4c4 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -532,6 +532,7 @@ match(const struct sk_buff *skb,
}
if(info->seconds && info->hit_count) {
for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
+ if(r_list[location].last_pkts[pkt_count] == 0) break;
if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++;
}
if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a65e508fbd4..0d7dc668db4 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -98,7 +98,7 @@ fold_field(void *mib[], int offt)
}
/* snmp items */
-static struct snmp_mib snmp4_ipstats_list[] = {
+static const struct snmp_mib snmp4_ipstats_list[] = {
SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES),
SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
@@ -119,7 +119,7 @@ static struct snmp_mib snmp4_ipstats_list[] = {
SNMP_MIB_SENTINEL
};
-static struct snmp_mib snmp4_icmp_list[] = {
+static const struct snmp_mib snmp4_icmp_list[] = {
SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS),
SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS),
SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS),
@@ -149,7 +149,7 @@ static struct snmp_mib snmp4_icmp_list[] = {
SNMP_MIB_SENTINEL
};
-static struct snmp_mib snmp4_tcp_list[] = {
+static const struct snmp_mib snmp4_tcp_list[] = {
SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX),
@@ -167,7 +167,7 @@ static struct snmp_mib snmp4_tcp_list[] = {
SNMP_MIB_SENTINEL
};
-static struct snmp_mib snmp4_udp_list[] = {
+static const struct snmp_mib snmp4_udp_list[] = {
SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS),
SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
@@ -175,7 +175,7 @@ static struct snmp_mib snmp4_udp_list[] = {
SNMP_MIB_SENTINEL
};
-static struct snmp_mib snmp4_net_list[] = {
+static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT),
SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV),
SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 381dd6a6aeb..f701a136a6a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1371,7 +1371,7 @@ out: kfree_skb(skb);
* are needed for AMPRnet AX.25 paths.
*/
-static unsigned short mtu_plateau[] =
+static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
@@ -3149,8 +3149,7 @@ int __init ip_rt_init(void)
sizeof(struct rt_hash_bucket),
rhash_entries,
(num_physpages >= 128 * 1024) ?
- (27 - PAGE_SHIFT) :
- (29 - PAGE_SHIFT),
+ 15 : 17,
HASH_HIGHMEM,
&rt_hash_log,
&rt_hash_mask,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 65268562351..01444a02b48 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_tcp_congestion_control,
.strategy = &sysctl_tcp_congestion_control,
},
+ {
+ .ctl_name = NET_TCP_ABC,
+ .procname = "tcp_abc",
+ .data = &sysctl_tcp_abc,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 72b7c22e1ea..ef98b14ac56 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1413,7 +1413,7 @@ recv_urg:
* closed.
*/
-static unsigned char new_state[16] = {
+static const unsigned char new_state[16] = {
/* current state: new state: action: */
/* (Invalid) */ TCP_CLOSE,
/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags)
} else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
- /* The last check adjusts for discrepance of Linux wrt. RFC
+ /* The last check adjusts for discrepancy of Linux wrt. RFC
* states
*/
tcp_send_active_reset(sk, gfp_any());
@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->packets_out = 0;
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_cnt = 0;
+ tp->bytes_acked = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
inet_csk_delack_init(sk);
@@ -2064,8 +2065,7 @@ void __init tcp_init(void)
sizeof(struct inet_ehash_bucket),
thash_entries,
(num_physpages >= 128 * 1024) ?
- (25 - PAGE_SHIFT) :
- (27 - PAGE_SHIFT),
+ 13 : 15,
HASH_HIGHMEM,
&tcp_hashinfo.ehash_size,
NULL,
@@ -2081,8 +2081,7 @@ void __init tcp_init(void)
sizeof(struct inet_bind_hashbucket),
tcp_hashinfo.ehash_size,
(num_physpages >= 128 * 1024) ?
- (25 - PAGE_SHIFT) :
- (27 - PAGE_SHIFT),
+ 13 : 15,
HASH_HIGHMEM,
&tcp_hashinfo.bhash_size,
NULL,
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ae35e060904..1d0cd86621b 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack,
bictcp_low_utilization(sk, data_acked);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* In "safe" area, increase. */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
bictcp_update(ca, tp->snd_cwnd);
- /* In dangerous area, increase slowly.
+ /* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
*/
if (tp->snd_cwnd_cnt >= ca->cnt) {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index bbf2d6624e8..c7cc62c8dc1 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* In "safe" area, increase. */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
- }
+ /* In "safe" area, increase. */
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+
+ /* In dangerous area, increase slowly. */
+ else if (sysctl_tcp_abc) {
+ /* RFC3465: Apppriate Byte Count
+ * increase once for each full cwnd acked
+ */
+ if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
+ tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ }
+ } else {
+ /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt++;
+ }
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 6acc04bde08..63cf7e54084 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk)
}
static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
- u32 in_flight, int good)
+ u32 in_flight, int data_acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
/* Update AIMD parameters */
if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index e47b37984e9..3284cfb993e 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* In "safe" area, increase. */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
+
measure_rtt(sk);
/* keep track of number of round-trip times since last backoff event */
@@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
htcp_alpha_update(ca);
}
- /* In dangerous area, increase slowly.
+ /* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
*/
if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 77add63623d..40dbb387751 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
ca->minrtt = tp->srtt;
}
+ if (!tcp_is_cwnd_limited(sk, in_flight))
+ return;
+
if (!ca->hybla_en)
return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
- if (in_flight < tp->snd_cwnd)
- return;
-
if (ca->rho == 0)
hybla_recalc_param(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e98b57578d..bf2e23086bc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -42,7 +42,7 @@
* Andi Kleen : Moved open_request checking here
* and process RSTs for open_requests.
* Andi Kleen : Better prune_queue, and other fixes.
- * Andrey Savochkin: Fix RTT measurements in the presnce of
+ * Andrey Savochkin: Fix RTT measurements in the presence of
* timestamps.
* Andrey Savochkin: Check sequence numbers correctly when
* removing SACKs due to in sequence incoming
@@ -89,6 +89,7 @@ int sysctl_tcp_frto;
int sysctl_tcp_nometrics_save;
int sysctl_tcp_moderate_rcvbuf = 1;
+int sysctl_tcp_abc = 1;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk)
* of receiver window. Check #2.
*
* The scheme does not work when sender sends good segments opening
- * window and then starts to feed us spagetti. But it should work
+ * window and then starts to feed us spaghetti. But it should work
* in common situations. Otherwise, we have to rely on queue collapsing.
*/
@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
{
/* Optimize this! */
int truesize = tcp_win_from_space(skb->truesize)/2;
- int window = tcp_full_space(sk)/2;
+ int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
/* Try to select rcvbuf so that 4 mss-sized segments
- * will fit to window and correspoding skbs will fit to our rcvbuf.
+ * will fit to window and corresponding skbs will fit to our rcvbuf.
* (was 3; 4 is minimum to allow fast retransmit to work.)
*/
while (tcp_win_from_space(rcvmem) < tp->advmss)
@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
}
-/* 4. Try to fixup all. It is made iimediately after connection enters
+/* 4. Try to fixup all. It is made immediately after connection enters
* established state.
*/
static void tcp_init_buffer_space(struct sock *sk)
@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk)
static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct sk_buff *skb;
- unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
- int ofo_win = 0;
icsk->icsk_ack.quick = 0;
- skb_queue_walk(&tp->out_of_order_queue, skb) {
- ofo_win += skb->len;
- }
-
- /* If overcommit is due to out of order segments,
- * do not clamp window. Try to expand rcvbuf instead.
- */
- if (ofo_win) {
- if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
- !tcp_memory_pressure &&
- atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
- sysctl_tcp_rmem[2]);
+ if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+ !tcp_memory_pressure &&
+ atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+ sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+ sysctl_tcp_rmem[2]);
}
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
- app_win += ofo_win;
- if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
- app_win >>= 1;
- if (app_win > icsk->icsk_ack.rcv_mss)
- app_win -= icsk->icsk_ack.rcv_mss;
- app_win = max(app_win, 2U*tp->advmss);
-
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
- }
}
/* Receiver "autotuning" code.
@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
* are stalled on filesystem I/O.
*
* Also, since we are only going for a minimum in the
- * non-timestamp case, we do not smoothe things out
- * else with timestamps disabled convergance takes too
+ * non-timestamp case, we do not smooth things out
+ * else with timestamps disabled convergence takes too
* long.
*/
if (!win_dep) {
@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
} else if (m < new_sample)
new_sample = m << 3;
} else {
- /* No previous mesaure. */
+ /* No previous measure. */
new_sample = m << 3;
}
@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
if (icsk->icsk_ack.ato > icsk->icsk_rto)
icsk->icsk_ack.ato = icsk->icsk_rto;
} else if (m > icsk->icsk_rto) {
- /* Too long gap. Apparently sender falled to
+ /* Too long gap. Apparently sender failed to
* restart window, so that we send ACKs quickly.
*/
tcp_incr_quickack(sk);
@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
{
struct tcp_sock *tp = tcp_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
long m = mrtt; /* RTT */
/* The following amusing code comes from Jacobson's
@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
*
* Funny. This algorithm seems to be very broken.
* These formulae increase RTO, when it should be decreased, increase
- * too slowly, when it should be incresed fastly, decrease too fastly
+ * too slowly, when it should be increased quickly, decrease too quickly
* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
* does not matter how to _calculate_ it. Seems, it was trap
* that VJ failed to avoid. 8)
@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
tp->rtt_seq = tp->snd_nxt;
}
-
- if (icsk->icsk_ca_ops->rtt_sample)
- icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk)
* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
* to do with delayed acks, because at cwnd>2 true delack timeout
* is invisible. Actually, Linux-2.4 also generates erratic
- * ACKs in some curcumstances.
+ * ACKs in some circumstances.
*/
inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
* all the algo is pure shit and should be replaced
- * with correct one. It is exaclty, which we pretend to do.
+ * with correct one. It is exactly, which we pretend to do.
*/
}
@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk)
* to make it more realistic.
*
* A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal curcumstances sending small
+ * is sent until it is ACKed. In normal circumstances sending small
* packets force peer to delay ACKs and calculation is correct too.
* The algorithm is adaptive and, provided we follow specs, it
* NEVER underestimate RTT. BUT! If peer tries to make some clever
@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
int prior_fackets;
u32 lost_retrans = 0;
int flag = 0;
+ int dup_sack = 0;
int i;
if (!tp->sacked_out)
tp->fackets_out = 0;
prior_fackets = tp->fackets_out;
- for (i=0; i<num_sacks; i++, sp++) {
- struct sk_buff *skb;
- __u32 start_seq = ntohl(sp->start_seq);
- __u32 end_seq = ntohl(sp->end_seq);
- int fack_count = 0;
- int dup_sack = 0;
+ /* SACK fastpath:
+ * if the only SACK change is the increase of the end_seq of
+ * the first block then only apply that SACK block
+ * and use retrans queue hinting otherwise slowpath */
+ flag = 1;
+ for (i = 0; i< num_sacks; i++) {
+ __u32 start_seq = ntohl(sp[i].start_seq);
+ __u32 end_seq = ntohl(sp[i].end_seq);
+
+ if (i == 0){
+ if (tp->recv_sack_cache[i].start_seq != start_seq)
+ flag = 0;
+ } else {
+ if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
+ (tp->recv_sack_cache[i].end_seq != end_seq))
+ flag = 0;
+ }
+ tp->recv_sack_cache[i].start_seq = start_seq;
+ tp->recv_sack_cache[i].end_seq = end_seq;
/* Check for D-SACK. */
if (i == 0) {
@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (before(ack, prior_snd_una - tp->max_window))
return 0;
}
+ }
+
+ if (flag)
+ num_sacks = 1;
+ else {
+ int j;
+ tp->fastpath_skb_hint = NULL;
+
+ /* order SACK blocks to allow in order walk of the retrans queue */
+ for (i = num_sacks-1; i > 0; i--) {
+ for (j = 0; j < i; j++){
+ if (after(ntohl(sp[j].start_seq),
+ ntohl(sp[j+1].start_seq))){
+ sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq);
+ sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq);
+ sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq);
+ sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq);
+ }
+
+ }
+ }
+ }
+
+ /* clear flag as used for different purpose in following code */
+ flag = 0;
+
+ for (i=0; i<num_sacks; i++, sp++) {
+ struct sk_buff *skb;
+ __u32 start_seq = ntohl(sp->start_seq);
+ __u32 end_seq = ntohl(sp->end_seq);
+ int fack_count;
+
+ /* Use SACK fastpath hint if valid */
+ if (tp->fastpath_skb_hint) {
+ skb = tp->fastpath_skb_hint;
+ fack_count = tp->fastpath_cnt_hint;
+ } else {
+ skb = sk->sk_write_queue.next;
+ fack_count = 0;
+ }
/* Event "B" in the comment above. */
if (after(end_seq, tp->high_seq))
flag |= FLAG_DATA_LOST;
- sk_stream_for_retrans_queue(skb, sk) {
+ sk_stream_for_retrans_queue_from(skb, sk) {
int in_sack, pcount;
u8 sacked;
+ tp->fastpath_skb_hint = skb;
+ tp->fastpath_cnt_hint = fack_count;
+
/* The retransmission queue is always in order, so
* we can short-circuit the walk early.
*/
@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
tp->lost_out -= tcp_skb_pcount(skb);
tp->retrans_out -= tcp_skb_pcount(skb);
+
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
}
} else {
/* New sack for not retransmitted frame,
@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
tp->lost_out -= tcp_skb_pcount(skb);
+
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
}
}
@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
+ tp->retransmit_skb_hint = NULL;
}
}
}
@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
+
if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->frto_highmark;
TCP_ECN_queue_cwr(tp);
+
+ clear_all_retrans_hints(tp);
}
void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->bytes_acked = 0;
tcp_clear_retrans(tp);
/* Push undo marker, if it was plain RTO and nothing
@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how)
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
+
+ clear_all_retrans_hints(tp);
}
static int tcp_check_sack_reneging(struct sock *sk)
@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
int packets, u32 high_seq)
{
struct sk_buff *skb;
- int cnt = packets;
+ int cnt;
- BUG_TRAP(cnt <= tp->packets_out);
+ BUG_TRAP(packets <= tp->packets_out);
+ if (tp->lost_skb_hint) {
+ skb = tp->lost_skb_hint;
+ cnt = tp->lost_cnt_hint;
+ } else {
+ skb = sk->sk_write_queue.next;
+ cnt = 0;
+ }
- sk_stream_for_retrans_queue(skb, sk) {
- cnt -= tcp_skb_pcount(skb);
- if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
+ sk_stream_for_retrans_queue_from(skb, sk) {
+ /* TODO: do this better */
+ /* this is not the most efficient way to do this... */
+ tp->lost_skb_hint = skb;
+ tp->lost_cnt_hint = cnt;
+ cnt += tcp_skb_pcount(skb);
+ if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
break;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
+
+ /* clear xmit_retransmit_queue hints
+ * if this is beyond hint */
+ if(tp->retransmit_skb_hint != NULL &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
+
+ tp->retransmit_skb_hint = NULL;
+ }
}
}
tcp_sync_left_out(tp);
@@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
if (tcp_head_timedout(sk, tp)) {
struct sk_buff *skb;
- sk_stream_for_retrans_queue(skb, sk) {
- if (tcp_skb_timedout(sk, skb) &&
- !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+ skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
+ : sk->sk_write_queue.next;
+
+ sk_stream_for_retrans_queue_from(skb, sk) {
+ if (!tcp_skb_timedout(sk, skb))
+ break;
+
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
+
+ /* clear xmit_retrans hint */
+ if (tp->retransmit_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+
+ tp->retransmit_skb_hint = NULL;
}
}
+
+ tp->scoreboard_skb_hint = skb;
+
tcp_sync_left_out(tp);
}
}
@@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
}
tcp_moderate_cwnd(tp);
tp->snd_cwnd_stamp = tcp_time_stamp;
+
+ /* There is something screwy going on with the retrans hints after
+ an undo */
+ clear_all_retrans_hints(tp);
}
static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
sk_stream_for_retrans_queue(skb, sk) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
}
+
+ clear_all_retrans_hints(tp);
+
DBGUNDO(sk, tp, "partial loss");
tp->lost_out = 0;
tp->left_out = tp->sacked_out;
@@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
TCP_ECN_queue_cwr(tp);
}
+ tp->bytes_acked = 0;
tp->snd_cwnd_cnt = 0;
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
@@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
}
/* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Superceeds RFC1323)
+ * with this code. (Supersedes RFC1323)
*/
-static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
+static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
{
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
@@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
*
* Changed: reset backoff as soon as we see the first valid sample.
- * If we do not, we get strongly overstimated rto. With timestamps
+ * If we do not, we get strongly overestimated rto. With timestamps
* samples are accepted even from very old segments: f.e., when rtt=1
* increases to 8, we retransmit 5 times and after 8 seconds delayed
* answer arrives rto becomes 120 seconds! If at least one of segments
@@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
*/
struct tcp_sock *tp = tcp_sk(sk);
const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
- tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk);
}
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
+static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
@@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag
if (flag & FLAG_RETRANS_DATA_ACKED)
return;
- tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk);
}
static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
- const s32 seq_rtt, u32 *usrtt)
+ const s32 seq_rtt)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
- tcp_ack_saw_tstamp(sk, usrtt, flag);
+ tcp_ack_saw_tstamp(sk, flag);
else if (seq_rtt >= 0)
- tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
+ tcp_ack_no_tstamp(sk, seq_rtt, flag);
}
static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
@@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
return acked;
}
+static inline u32 tcp_usrtt(const struct sk_buff *skb)
+{
+ struct timeval tv, now;
+
+ do_gettimeofday(&now);
+ skb_get_timestamp(skb, &tv);
+ return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec);
+}
/* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
{
struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
__u32 now = tcp_time_stamp;
int acked = 0;
__s32 seq_rtt = -1;
- struct timeval usnow;
u32 pkts_acked = 0;
-
- if (seq_usrtt)
- do_gettimeofday(&usnow);
+ void (*rtt_sample)(struct sock *sk, u32 usrtt)
+ = icsk->icsk_ca_ops->rtt_sample;
while ((skb = skb_peek(&sk->sk_write_queue)) &&
skb != sk->sk_send_head) {
@@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
tp->retrans_out -= tcp_skb_pcount(skb);
acked |= FLAG_RETRANS_DATA_ACKED;
seq_rtt = -1;
- } else if (seq_rtt < 0)
+ } else if (seq_rtt < 0) {
seq_rtt = now - scb->when;
- if (seq_usrtt) {
- struct timeval tv;
-
- skb_get_timestamp(skb, &tv);
- *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
- + (usnow.tv_usec - tv.tv_usec);
+ if (rtt_sample)
+ (*rtt_sample)(sk, tcp_usrtt(skb));
}
-
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= tcp_skb_pcount(skb);
if (sacked & TCPCB_LOST)
@@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
!before(scb->end_seq, tp->snd_up))
tp->urg_mode = 0;
}
- } else if (seq_rtt < 0)
+ } else if (seq_rtt < 0) {
seq_rtt = now - scb->when;
+ if (rtt_sample)
+ (*rtt_sample)(sk, tcp_usrtt(skb));
+ }
tcp_dec_pcount_approx(&tp->fackets_out, skb);
tcp_packets_out_dec(tp, skb);
__skb_unlink(skb, &sk->sk_write_queue);
sk_stream_free_skb(sk, skb);
+ clear_all_retrans_hints(tp);
}
if (acked&FLAG_ACKED) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
+ tcp_ack_update_rtt(sk, acked, seq_rtt);
tcp_ack_packets_out(sk, tp);
if (icsk->icsk_ca_ops->pkts_acked)
@@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
}
/* F-RTO affects on two new ACKs following RTO.
- * At latest on third ACK the TCP behavor is back to normal.
+ * At latest on third ACK the TCP behavior is back to normal.
*/
tp->frto_counter = (tp->frto_counter + 1) % 3;
}
@@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
u32 ack = TCP_SKB_CB(skb)->ack_seq;
u32 prior_in_flight;
s32 seq_rtt;
- s32 seq_usrtt = 0;
int prior_packets;
/* If the ack is newer than sent or older than previous acks
@@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
if (before(ack, prior_snd_una))
goto old_ack;
+ if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR)
+ tp->bytes_acked += ack - prior_snd_una;
+
if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
@@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
prior_in_flight = tcp_packets_in_flight(tp);
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
- icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
+ flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
if (tp->frto_counter)
tcp_process_frto(sk, prior_snd_una);
if (tcp_ack_is_dubious(sk, flag)) {
- /* Advanve CWND, if state allows this. */
+ /* Advance CWND, if state allows this. */
if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
@@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
{
struct sk_buff *skb;
- /* First, check that queue is collapsable and find
+ /* First, check that queue is collapsible and find
* the point where collapsing can be useful. */
for (skb = head; skb != tail; ) {
/* No new bits? It is possible on ofo queue. */
@@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk)
/*
* This routine is only called when we have urgent data
- * signalled. Its the 'slow' part of tcp_urg. It could be
+ * signaled. Its the 'slow' part of tcp_urg. It could be
* moved inline now as tcp_urg is only called from one
* place. We handle URGent data wrong. We have to - as
* BSD still doesn't use the correction from RFC961.
@@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
* urgent. To do this requires some care. We cannot just ignore
* tp->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives
- * or we break the sematics of SIOCATMARK (and thus sockatmark())
+ * or we break the semantics of SIOCATMARK (and thus sockatmark())
*
* NOTE. Double Dutch. Rendering to plain English: author of comment
* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
@@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rx_opt.saw_tstamp = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
- * if header_predition is to be made
+ * if header_prediction is to be made
* 'S' will always be tp->tcp_header_len >> 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
* turn it off (when there are holes in the receive
@@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!tp->srtt)
- tcp_ack_saw_tstamp(sk, NULL, 0);
+ tcp_ack_saw_tstamp(sk, 0);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4372,6 +4471,7 @@ discard:
EXPORT_SYMBOL(sysctl_tcp_ecn);
EXPORT_SYMBOL(sysctl_tcp_reordering);
+EXPORT_SYMBOL(sysctl_tcp_abc);
EXPORT_SYMBOL(tcp_parse_options);
EXPORT_SYMBOL(tcp_rcv_established);
EXPORT_SYMBOL(tcp_rcv_state_process);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 634dabb558f..4d5021e1929 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -39,7 +39,7 @@
* request_sock handling and moved
* most of it into the af independent code.
* Added tail drop and some other bugfixes.
- * Added new listen sematics.
+ * Added new listen semantics.
* Mike McLagan : Routing by source
* Juan Jose Ciarlante: ip_dynaddr bits
* Andi Kleen: various fixes.
@@ -1110,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
if (skb->ip_summed == CHECKSUM_HW) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
- skb->nh.iph->daddr, skb->csum))
+ skb->nh.iph->daddr, skb->csum)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
return 0;
-
- LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
- skb->ip_summed = CHECKSUM_NONE;
+ }
}
+
+ skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
+ skb->len, IPPROTO_TCP, 0);
+
if (skb->len <= 76) {
- if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
- skb->nh.iph->daddr,
- skb_checksum(skb, 0, skb->len, 0)))
- return -1;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- } else {
- skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
- skb->nh.iph->saddr,
- skb->nh.iph->daddr, 0);
+ return __skb_checksum_complete(skb);
}
return 0;
}
@@ -1216,10 +1210,10 @@ int tcp_v4_rcv(struct sk_buff *skb)
/* An explanation is required here, I think.
* Packet length and doff are validated by header prediction,
- * provided case of th->doff==0 is elimineted.
+ * provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
- tcp_v4_checksum_init(skb) < 0))
+ tcp_v4_checksum_init(skb)))
goto bad_packet;
th = skb->h.th;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b1a63b2c6b4..1b66a2ac432 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -158,7 +158,7 @@ kill_with_rst:
/* I am shamed, but failed to make it more elegant.
* Yes, it is direct reference to IP, which is impossible
* to generalize to IPv6. Taking into account that IPv6
- * do not undertsnad recycling in any case, it not
+ * do not understand recycling in any case, it not
* a big problem in practice. --ANK */
if (tw->tw_family == AF_INET &&
tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
@@ -194,7 +194,7 @@ kill_with_rst:
/* In window segment, it may be only reset or bare ack. */
if (th->rst) {
- /* This is TIME_WAIT assasination, in two flavors.
+ /* This is TIME_WAIT assassination, in two flavors.
* Oh well... nobody has a sufficient solution to this
* protocol bug yet.
*/
@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
*/
newtp->snd_cwnd = 2;
newtp->snd_cwnd_cnt = 0;
+ newtp->bytes_acked = 0;
newtp->frto_counter = 0;
newtp->frto_highmark = 0;
@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
/* RFC793 page 36: "If the connection is in any non-synchronized state ...
* and the incoming segment acknowledges something not yet
- * sent (the segment carries an unaccaptable ACK) ...
+ * sent (the segment carries an unacceptable ACK) ...
* a reset is sent."
*
* Invalid ACK: reset will be sent by listening socket
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b907456a79f..b7325e0b406 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
{
- if (skb != NULL) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- int tcp_header_size = tp->tcp_header_len;
- struct tcphdr *th;
- int sysctl_flags;
- int err;
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet;
+ struct tcp_sock *tp;
+ struct tcp_skb_cb *tcb;
+ int tcp_header_size;
+ struct tcphdr *th;
+ int sysctl_flags;
+ int err;
+
+ BUG_ON(!skb || !tcp_skb_pcount(skb));
- BUG_ON(!tcp_skb_pcount(skb));
+ /* If congestion control is doing timestamping, we must
+ * take such a timestamp before we potentially clone/copy.
+ */
+ if (icsk->icsk_ca_ops->rtt_sample)
+ __net_timestamp(skb);
+
+ if (likely(clone_it)) {
+ if (unlikely(skb_cloned(skb)))
+ skb = pskb_copy(skb, gfp_mask);
+ else
+ skb = skb_clone(skb, gfp_mask);
+ if (unlikely(!skb))
+ return -ENOBUFS;
+ }
+
+ inet = inet_sk(sk);
+ tp = tcp_sk(sk);
+ tcb = TCP_SKB_CB(skb);
+ tcp_header_size = tp->tcp_header_len;
#define SYSCTL_FLAG_TSTAMPS 0x1
#define SYSCTL_FLAG_WSCALE 0x2
#define SYSCTL_FLAG_SACK 0x4
- /* If congestion control is doing timestamping */
- if (icsk->icsk_ca_ops->rtt_sample)
- __net_timestamp(skb);
-
- sysctl_flags = 0;
- if (tcb->flags & TCPCB_FLAG_SYN) {
- tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
- if(sysctl_tcp_timestamps) {
- tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
- sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
- }
- if(sysctl_tcp_window_scaling) {
- tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
- sysctl_flags |= SYSCTL_FLAG_WSCALE;
- }
- if(sysctl_tcp_sack) {
- sysctl_flags |= SYSCTL_FLAG_SACK;
- if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
- tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
- }
- } else if (tp->rx_opt.eff_sacks) {
- /* A SACK is 2 pad bytes, a 2 byte header, plus
- * 2 32-bit sequence numbers for each SACK block.
- */
- tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
+ sysctl_flags = 0;
+ if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+ tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
+ if(sysctl_tcp_timestamps) {
+ tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
+ sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
}
-
- if (tcp_packets_in_flight(tp) == 0)
- tcp_ca_event(sk, CA_EVENT_TX_START);
-
- th = (struct tcphdr *) skb_push(skb, tcp_header_size);
- skb->h.th = th;
- skb_set_owner_w(skb, sk);
-
- /* Build TCP header and checksum it. */
- th->source = inet->sport;
- th->dest = inet->dport;
- th->seq = htonl(tcb->seq);
- th->ack_seq = htonl(tp->rcv_nxt);
- *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
- if (tcb->flags & TCPCB_FLAG_SYN) {
- /* RFC1323: The window in SYN & SYN/ACK segments
- * is never scaled.
- */
- th->window = htons(tp->rcv_wnd);
- } else {
- th->window = htons(tcp_select_window(sk));
+ if (sysctl_tcp_window_scaling) {
+ tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
+ sysctl_flags |= SYSCTL_FLAG_WSCALE;
}
- th->check = 0;
- th->urg_ptr = 0;
-
- if (tp->urg_mode &&
- between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
- th->urg_ptr = htons(tp->snd_up-tcb->seq);
- th->urg = 1;
+ if (sysctl_tcp_sack) {
+ sysctl_flags |= SYSCTL_FLAG_SACK;
+ if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
+ tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
}
+ } else if (unlikely(tp->rx_opt.eff_sacks)) {
+ /* A SACK is 2 pad bytes, a 2 byte header, plus
+ * 2 32-bit sequence numbers for each SACK block.
+ */
+ tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
+ (tp->rx_opt.eff_sacks *
+ TCPOLEN_SACK_PERBLOCK));
+ }
+
+ if (tcp_packets_in_flight(tp) == 0)
+ tcp_ca_event(sk, CA_EVENT_TX_START);
+
+ th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+ skb->h.th = th;
+ skb_set_owner_w(skb, sk);
+
+ /* Build TCP header and checksum it. */
+ th->source = inet->sport;
+ th->dest = inet->dport;
+ th->seq = htonl(tcb->seq);
+ th->ack_seq = htonl(tp->rcv_nxt);
+ *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
+ tcb->flags);
+
+ if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ th->window = htons(tp->rcv_wnd);
+ } else {
+ th->window = htons(tcp_select_window(sk));
+ }
+ th->check = 0;
+ th->urg_ptr = 0;
- if (tcb->flags & TCPCB_FLAG_SYN) {
- tcp_syn_build_options((__u32 *)(th + 1),
- tcp_advertise_mss(sk),
- (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
- (sysctl_flags & SYSCTL_FLAG_SACK),
- (sysctl_flags & SYSCTL_FLAG_WSCALE),
- tp->rx_opt.rcv_wscale,
- tcb->when,
- tp->rx_opt.ts_recent);
- } else {
- tcp_build_and_update_options((__u32 *)(th + 1),
- tp, tcb->when);
+ if (unlikely(tp->urg_mode &&
+ between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
+ th->urg_ptr = htons(tp->snd_up-tcb->seq);
+ th->urg = 1;
+ }
- TCP_ECN_send(sk, tp, skb, tcp_header_size);
- }
- tp->af_specific->send_check(sk, th, skb->len, skb);
+ if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+ tcp_syn_build_options((__u32 *)(th + 1),
+ tcp_advertise_mss(sk),
+ (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
+ (sysctl_flags & SYSCTL_FLAG_SACK),
+ (sysctl_flags & SYSCTL_FLAG_WSCALE),
+ tp->rx_opt.rcv_wscale,
+ tcb->when,
+ tp->rx_opt.ts_recent);
+ } else {
+ tcp_build_and_update_options((__u32 *)(th + 1),
+ tp, tcb->when);
+ TCP_ECN_send(sk, tp, skb, tcp_header_size);
+ }
- if (tcb->flags & TCPCB_FLAG_ACK)
- tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+ tp->af_specific->send_check(sk, th, skb->len, skb);
- if (skb->len != tcp_header_size)
- tcp_event_data_sent(tp, skb, sk);
+ if (likely(tcb->flags & TCPCB_FLAG_ACK))
+ tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
- TCP_INC_STATS(TCP_MIB_OUTSEGS);
+ if (skb->len != tcp_header_size)
+ tcp_event_data_sent(tp, skb, sk);
- err = tp->af_specific->queue_xmit(skb, 0);
- if (err <= 0)
- return err;
+ TCP_INC_STATS(TCP_MIB_OUTSEGS);
- tcp_enter_cwr(sk);
+ err = tp->af_specific->queue_xmit(skb, 0);
+ if (unlikely(err <= 0))
+ return err;
+
+ tcp_enter_cwr(sk);
+
+ /* NET_XMIT_CN is special. It does not guarantee,
+ * that this packet is lost. It tells that device
+ * is about to start to drop packets or already
+ * drops some packets of the same priority and
+ * invokes us to send less aggressively.
+ */
+ return err == NET_XMIT_CN ? 0 : err;
- /* NET_XMIT_CN is special. It does not guarantee,
- * that this packet is lost. It tells that device
- * is about to start to drop packets or already
- * drops some packets of the same priority and
- * invokes us to send less aggressively.
- */
- return err == NET_XMIT_CN ? 0 : err;
- }
- return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
@@ -436,6 +453,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
u16 flags;
BUG_ON(len > skb->len);
+
+ clear_all_retrans_hints(tp);
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
@@ -599,7 +618,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
for TCP options, but includes only bare TCP header.
tp->rx_opt.mss_clamp is mss negotiated at connection setup.
- It is minumum of user_mss and mss received with SYN.
+ It is minimum of user_mss and mss received with SYN.
It also does not include TCP options.
tp->pmtu_cookie is last pmtu, seen by this function.
@@ -1034,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
+ if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
break;
/* Advance the send_head. This one is sent out.
@@ -1107,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
/* Send it out now. */
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
+ if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
update_send_head(sk, tp, skb);
tcp_cwnd_validate(sk, tp);
return;
@@ -1171,7 +1190,7 @@ u32 __tcp_select_window(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- /* MSS for the peer's data. Previous verions used mss_clamp
+ /* MSS for the peer's data. Previous versions used mss_clamp
* here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct
* but may be worse for the performance because of rcv_mss
@@ -1260,7 +1279,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
BUG_ON(tcp_skb_pcount(skb) != 1 ||
tcp_skb_pcount(next_skb) != 1);
- /* Ok. We will be able to collapse the packet. */
+ /* changing transmit queue under us so clear hints */
+ clear_all_retrans_hints(tp);
+
+ /* Ok. We will be able to collapse the packet. */
__skb_unlink(next_skb, &sk->sk_write_queue);
memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
@@ -1330,6 +1352,8 @@ void tcp_simple_retransmit(struct sock *sk)
}
}
+ clear_all_retrans_hints(tp);
+
if (!lost)
return;
@@ -1361,7 +1385,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
int err;
/* Do not sent more than we queued. 1/4 is reserved for possible
- * copying overhead: frgagmentation, tunneling, mangling etc.
+ * copying overhead: fragmentation, tunneling, mangling etc.
*/
if (atomic_read(&sk->sk_wmem_alloc) >
min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
@@ -1422,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
*/
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
- pskb_copy(skb, GFP_ATOMIC):
- skb_clone(skb, GFP_ATOMIC)));
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
if (err == 0) {
/* Update global TCP statistics. */
@@ -1468,13 +1490,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int packet_cnt = tp->lost_out;
+ int packet_cnt;
+
+ if (tp->retransmit_skb_hint) {
+ skb = tp->retransmit_skb_hint;
+ packet_cnt = tp->retransmit_cnt_hint;
+ }else{
+ skb = sk->sk_write_queue.next;
+ packet_cnt = 0;
+ }
/* First pass: retransmit lost packets. */
- if (packet_cnt) {
- sk_stream_for_retrans_queue(skb, sk) {
+ if (tp->lost_out) {
+ sk_stream_for_retrans_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked;
+ /* we could do better than to assign each time */
+ tp->retransmit_skb_hint = skb;
+ tp->retransmit_cnt_hint = packet_cnt;
+
/* Assume this retransmit will generate
* only one packet for congestion window
* calculation purposes. This works because
@@ -1485,10 +1519,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
return;
- if (sacked&TCPCB_LOST) {
+ if (sacked & TCPCB_LOST) {
if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
- if (tcp_retransmit_skb(sk, skb))
+ if (tcp_retransmit_skb(sk, skb)) {
+ tp->retransmit_skb_hint = NULL;
return;
+ }
if (icsk->icsk_ca_state != TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
else
@@ -1501,8 +1537,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
TCP_RTO_MAX);
}
- packet_cnt -= tcp_skb_pcount(skb);
- if (packet_cnt <= 0)
+ packet_cnt += tcp_skb_pcount(skb);
+ if (packet_cnt >= tp->lost_out)
break;
}
}
@@ -1528,9 +1564,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_may_send_now(sk, tp))
return;
- packet_cnt = 0;
+ if (tp->forward_skb_hint) {
+ skb = tp->forward_skb_hint;
+ packet_cnt = tp->forward_cnt_hint;
+ } else{
+ skb = sk->sk_write_queue.next;
+ packet_cnt = 0;
+ }
+
+ sk_stream_for_retrans_queue_from(skb, sk) {
+ tp->forward_cnt_hint = packet_cnt;
+ tp->forward_skb_hint = skb;
- sk_stream_for_retrans_queue(skb, sk) {
/* Similar to the retransmit loop above we
* can pretend that the retransmitted SKB
* we send out here will be composed of one
@@ -1547,8 +1592,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
continue;
/* Ok, retransmit it. */
- if (tcp_retransmit_skb(sk, skb))
+ if (tcp_retransmit_skb(sk, skb)) {
+ tp->forward_skb_hint = NULL;
break;
+ }
if (skb == skb_peek(&sk->sk_write_queue))
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
@@ -1633,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- if (tcp_transmit_skb(sk, skb))
+ if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
}
@@ -1668,7 +1715,7 @@ int tcp_send_synack(struct sock *sk)
TCP_ECN_send_synack(tcp_sk(sk), skb);
}
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
/*
@@ -1829,7 +1876,7 @@ int tcp_connect(struct sock *sk)
__skb_queue_tail(&sk->sk_write_queue, buff);
sk_charge_skb(sk, buff);
tp->packets_out += tcp_skb_pcount(buff);
- tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
+ tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
@@ -1925,7 +1972,7 @@ void tcp_send_ack(struct sock *sk)
/* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tcp_transmit_skb(sk, buff);
+ tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
}
}
@@ -1965,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- return tcp_transmit_skb(sk, skb);
+ return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
int tcp_write_wakeup(struct sock *sk)
@@ -1998,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk)
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
if (!err) {
update_send_head(sk, tp, skb);
}
@@ -2058,3 +2105,4 @@ EXPORT_SYMBOL(tcp_connect);
EXPORT_SYMBOL(tcp_make_synack);
EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(tcp_sync_mss);
+EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 327770bf552..26d7486ee50 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (in_flight < tp->snd_cwnd)
+
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
tp->snd_cwnd_cnt++;
if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
- tp->snd_cwnd++;
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
tp->snd_cwnd_cnt = 0;
}
}
- tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
- tp->snd_cwnd_stamp = tcp_time_stamp;
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 415ee47ac1c..e1880959614 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk)
* to prevent DoS attacks. It is called when a retransmission timeout
* or zero probe timeout occurs on orphaned socket.
*
- * Criterium is still not confirmed experimentally and may change.
+ * Criteria is still not confirmed experimentally and may change.
* We kill the socket, if:
* 1. If number of orphaned sockets exceeds an administratively configured
* limit.
@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk)
hole detection. :-(
It is place to make it. It is not made. I do not want
- to make it. It is disguisting. It does not work in any
+ to make it. It is disgusting. It does not work in any
case. Let me to cite the same draft, which requires for
us to implement this:
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 93c5f92070f..13e7e6e8df1 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -215,14 +215,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
vegas->beg_snd_nxt = tp->snd_nxt;
vegas->beg_snd_cwnd = tp->snd_cwnd;
- /* Take into account the current RTT sample too, to
- * decrease the impact of delayed acks. This double counts
- * this sample since we count it for the next window as well,
- * but that's not too awful, since we're taking the min,
- * rather than averaging.
- */
- tcp_vegas_rtt_calc(sk, seq_rtt * 1000);
-
/* We do the Vegas calculations only if we got enough RTT
* samples that we can be reasonably sure that we got
* at least one RTT sample that wasn't from a delayed ACK.
@@ -236,8 +228,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
- if (tp->snd_cwnd > tp->snd_ssthresh)
- tp->snd_cwnd++;
+ tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
} else {
u32 rtt, target_cwnd, diff;
@@ -275,7 +266,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
*/
diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
- if (tp->snd_cwnd < tp->snd_ssthresh) {
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
if (diff > gamma) {
/* Going too fast. Time to slow down
@@ -295,6 +286,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
V_PARAM_SHIFT)+1);
}
+ tcp_slow_start(tp);
} else {
/* Congestion avoidance. */
u32 next_snd_cwnd;
@@ -327,37 +319,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
else if (next_snd_cwnd < tp->snd_cwnd)
tp->snd_cwnd--;
}
+
+ if (tp->snd_cwnd < 2)
+ tp->snd_cwnd = 2;
+ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+ tp->snd_cwnd = tp->snd_cwnd_clamp;
}
/* Wipe the slate clean for the next RTT. */
vegas->cntRTT = 0;
vegas->minRTT = 0x7fffffff;
}
-
- /* The following code is executed for every ack we receive,
- * except for conditions checked in should_advance_cwnd()
- * before the call to tcp_cong_avoid(). Mainly this means that
- * we only execute this code if the ack actually acked some
- * data.
- */
-
- /* If we are in slow start, increase our cwnd in response to this ACK.
- * (If we are not in slow start then we are in congestion avoidance,
- * and adjust our congestion window only once per RTT. See the code
- * above.)
- */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- tp->snd_cwnd++;
-
- /* to keep cwnd from growing without bound */
- tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
-
- /* Make sure that we are never so timid as to reduce our cwnd below
- * 2 MSS.
- *
- * Going below 2 MSS would risk huge delayed ACKs from our receiver.
- */
- tp->snd_cwnd = max(tp->snd_cwnd, 2U);
}
/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e0bd1013cb0..2422a5f7195 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
{
- return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+ return __skb_checksum_complete(skb);
}
static __inline__ int udp_checksum_complete(struct sk_buff *skb)
@@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
if (uh->check == 0) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
} else if (skb->ip_summed == CHECKSUM_HW) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
- return 0;
- LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
}
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b2b60f3e9cd..42196ba3b0b 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -182,6 +182,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl)
case IPPROTO_UDP:
case IPPROTO_TCP:
case IPPROTO_SCTP:
+ case IPPROTO_DCCP:
if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
u16 *ports = (u16 *)xprth;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ddcf7754eec..a60585fd85a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -137,6 +137,7 @@ static int addrconf_ifdown(struct net_device *dev, int how);
static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags);
static void addrconf_dad_timer(unsigned long data);
static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
+static void addrconf_dad_run(struct inet6_dev *idev);
static void addrconf_rs_timer(unsigned long data);
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
@@ -379,8 +380,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
dev->type == ARPHRD_NONE ||
dev->type == ARPHRD_SIT) {
printk(KERN_INFO
- "Disabled Privacy Extensions on device %p(%s)\n",
- dev, dev->name);
+ "%s: Disabled Privacy Extensions\n",
+ dev->name);
ndev->cnf.use_tempaddr = -1;
} else {
in6_dev_hold(ndev);
@@ -388,6 +389,9 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
}
#endif
+ if (netif_carrier_ok(dev))
+ ndev->if_flags |= IF_READY;
+
write_lock_bh(&addrconf_lock);
dev->ip6_ptr = ndev;
write_unlock_bh(&addrconf_lock);
@@ -415,6 +419,7 @@ static struct inet6_dev * ipv6_find_idev(struct net_device *dev)
if ((idev = ipv6_add_dev(dev)) == NULL)
return NULL;
}
+
if (dev->flags&IFF_UP)
ipv6_mc_up(idev);
return idev;
@@ -634,8 +639,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
}
#endif
- for (ifap = &idev->addr_list; (ifa=*ifap) != NULL;
- ifap = &ifa->if_next) {
+ for (ifap = &idev->addr_list; (ifa=*ifap) != NULL;) {
if (ifa == ifp) {
*ifap = ifa->if_next;
__in6_ifa_put(ifp);
@@ -643,6 +647,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0)
break;
deleted = 1;
+ continue;
} else if (ifp->flags & IFA_F_PERMANENT) {
if (ipv6_prefix_equal(&ifa->addr, &ifp->addr,
ifp->prefix_len)) {
@@ -666,6 +671,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
}
}
}
+ ifap = &ifa->if_next;
}
write_unlock_bh(&idev->lock);
@@ -903,11 +909,18 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev,
score.addr_type = __ipv6_addr_type(&ifa->addr);
- /* Rule 0: Candidate Source Address (section 4)
+ /* Rule 0:
+ * - Tentative Address (RFC2462 section 5.4)
+ * - A tentative address is not considered
+ * "assigned to an interface" in the traditional
+ * sense.
+ * - Candidate Source Address (section 4)
* - In any case, anycast addresses, multicast
* addresses, and the unspecified address MUST
* NOT be included in a candidate set.
*/
+ if (ifa->flags & IFA_F_TENTATIVE)
+ continue;
if (unlikely(score.addr_type == IPV6_ADDR_ANY ||
score.addr_type & IPV6_ADDR_MULTICAST)) {
LIMIT_NETDEBUG(KERN_DEBUG
@@ -985,6 +998,8 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev,
}
/* Rule 4: Prefer home address -- not implemented yet */
+ if (hiscore.rule < 4)
+ hiscore.rule++;
/* Rule 5: Prefer outgoing interface */
if (hiscore.rule < 5) {
@@ -1045,9 +1060,10 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev,
}
#endif
/* Rule 8: Use longest matching prefix */
- if (hiscore.rule < 8)
+ if (hiscore.rule < 8) {
hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr);
- score.rule++;
+ hiscore.rule++;
+ }
score.matchlen = ipv6_addr_diff(&ifa->addr, daddr);
if (score.matchlen > hiscore.matchlen) {
score.rule = 8;
@@ -1212,10 +1228,8 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
/* Gets referenced address, destroys ifaddr */
-void addrconf_dad_failure(struct inet6_ifaddr *ifp)
+void addrconf_dad_stop(struct inet6_ifaddr *ifp)
{
- if (net_ratelimit())
- printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name);
if (ifp->flags&IFA_F_PERMANENT) {
spin_lock_bh(&ifp->lock);
addrconf_del_timer(ifp);
@@ -1241,6 +1255,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
ipv6_del_addr(ifp);
}
+void addrconf_dad_failure(struct inet6_ifaddr *ifp)
+{
+ if (net_ratelimit())
+ printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name);
+ addrconf_dad_stop(ifp);
+}
/* Join to solicited addr multicast group. */
@@ -1593,9 +1613,17 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
not good.
*/
if (valid_lft >= 0x7FFFFFFF/HZ)
- rt_expires = 0;
+ rt_expires = 0x7FFFFFFF - (0x7FFFFFFF % HZ);
else
- rt_expires = jiffies + valid_lft * HZ;
+ rt_expires = valid_lft * HZ;
+
+ /*
+ * We convert this (in jiffies) to clock_t later.
+ * Avoid arithmetic overflow there as well.
+ * Overflow can happen only if HZ < USER_HZ.
+ */
+ if (HZ < USER_HZ && rt_expires > 0x7FFFFFFF / USER_HZ)
+ rt_expires = 0x7FFFFFFF / USER_HZ;
if (pinfo->onlink) {
struct rt6_info *rt;
@@ -1607,12 +1635,12 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
ip6_del_rt(rt, NULL, NULL, NULL);
rt = NULL;
} else {
- rt->rt6i_expires = rt_expires;
+ rt->rt6i_expires = jiffies + rt_expires;
}
}
} else if (valid_lft) {
addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
- dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT);
+ dev, jiffies_to_clock_t(rt_expires), RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT);
}
if (rt)
dst_release(&rt->u.dst);
@@ -2122,9 +2150,42 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
{
struct net_device *dev = (struct net_device *) data;
struct inet6_dev *idev = __in6_dev_get(dev);
+ int run_pending = 0;
switch(event) {
case NETDEV_UP:
+ case NETDEV_CHANGE:
+ if (event == NETDEV_UP) {
+ if (!netif_carrier_ok(dev)) {
+ /* device is not ready yet. */
+ printk(KERN_INFO
+ "ADDRCONF(NETDEV_UP): %s: "
+ "link is not ready\n",
+ dev->name);
+ break;
+ }
+ } else {
+ if (!netif_carrier_ok(dev)) {
+ /* device is still not ready. */
+ break;
+ }
+
+ if (idev) {
+ if (idev->if_flags & IF_READY) {
+ /* device is already configured. */
+ break;
+ }
+ idev->if_flags |= IF_READY;
+ }
+
+ printk(KERN_INFO
+ "ADDRCONF(NETDEV_CHANGE): %s: "
+ "link becomes ready\n",
+ dev->name);
+
+ run_pending = 1;
+ }
+
switch(dev->type) {
case ARPHRD_SIT:
addrconf_sit_config(dev);
@@ -2141,6 +2202,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
break;
};
if (idev) {
+ if (run_pending)
+ addrconf_dad_run(idev);
+
/* If the MTU changed during the interface down, when the
interface up, the changed MTU must be reflected in the
idev as well as routers.
@@ -2175,8 +2239,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
*/
addrconf_ifdown(dev, event != NETDEV_DOWN);
break;
- case NETDEV_CHANGE:
- break;
+
case NETDEV_CHANGENAME:
#ifdef CONFIG_SYSCTL
if (idev) {
@@ -2257,7 +2320,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
/* Step 3: clear flags for stateless addrconf */
if (how != 1)
- idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD);
+ idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
/* Step 4: clear address list */
#ifdef CONFIG_IPV6_PRIVACY
@@ -2366,11 +2429,20 @@ out:
/*
* Duplicate Address Detection
*/
+static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
+{
+ unsigned long rand_num;
+ struct inet6_dev *idev = ifp->idev;
+
+ rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1);
+ ifp->probes = idev->cnf.dad_transmits;
+ addrconf_mod_timer(ifp, AC_DAD, rand_num);
+}
+
static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
{
struct inet6_dev *idev = ifp->idev;
struct net_device *dev = idev->dev;
- unsigned long rand_num;
addrconf_join_solict(dev, &ifp->addr);
@@ -2379,7 +2451,6 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
flags);
net_srandom(ifp->addr.s6_addr32[3]);
- rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1);
read_lock_bh(&idev->lock);
if (ifp->dead)
@@ -2396,9 +2467,19 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
return;
}
- ifp->probes = idev->cnf.dad_transmits;
- addrconf_mod_timer(ifp, AC_DAD, rand_num);
-
+ if (!(idev->if_flags & IF_READY)) {
+ spin_unlock_bh(&ifp->lock);
+ read_unlock_bh(&idev->lock);
+ /*
+ * If the defice is not ready:
+ * - keep it tentative if it is a permanent address.
+ * - otherwise, kill it.
+ */
+ in6_ifa_hold(ifp);
+ addrconf_dad_stop(ifp);
+ return;
+ }
+ addrconf_dad_kick(ifp);
spin_unlock_bh(&ifp->lock);
out:
read_unlock_bh(&idev->lock);
@@ -2481,6 +2562,22 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
}
}
+static void addrconf_dad_run(struct inet6_dev *idev) {
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) {
+ spin_lock_bh(&ifp->lock);
+ if (!(ifp->flags & IFA_F_TENTATIVE)) {
+ spin_unlock_bh(&ifp->lock);
+ continue;
+ }
+ spin_unlock_bh(&ifp->lock);
+ addrconf_dad_kick(ifp);
+ }
+ read_unlock_bh(&idev->lock);
+}
+
#ifdef CONFIG_PROC_FS
struct if6_iter_state {
int bucket;
@@ -2626,7 +2723,7 @@ static void addrconf_verify(unsigned long foo)
for (i=0; i < IN6_ADDR_HSIZE; i++) {
restart:
- write_lock(&addrconf_hash_lock);
+ read_lock(&addrconf_hash_lock);
for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) {
unsigned long age;
#ifdef CONFIG_IPV6_PRIVACY
@@ -2648,7 +2745,7 @@ restart:
if (age >= ifp->valid_lft) {
spin_unlock(&ifp->lock);
in6_ifa_hold(ifp);
- write_unlock(&addrconf_hash_lock);
+ read_unlock(&addrconf_hash_lock);
ipv6_del_addr(ifp);
goto restart;
} else if (age >= ifp->prefered_lft) {
@@ -2667,7 +2764,7 @@ restart:
if (deprecate) {
in6_ifa_hold(ifp);
- write_unlock(&addrconf_hash_lock);
+ read_unlock(&addrconf_hash_lock);
ipv6_ifa_notify(0, ifp);
in6_ifa_put(ifp);
@@ -2685,7 +2782,10 @@ restart:
in6_ifa_hold(ifp);
in6_ifa_hold(ifpub);
spin_unlock(&ifp->lock);
- write_unlock(&addrconf_hash_lock);
+ read_unlock(&addrconf_hash_lock);
+ spin_lock(&ifpub->lock);
+ ifpub->regen_count = 0;
+ spin_unlock(&ifpub->lock);
ipv6_create_tempaddr(ifpub, ifp);
in6_ifa_put(ifpub);
in6_ifa_put(ifp);
@@ -2702,7 +2802,7 @@ restart:
spin_unlock(&ifp->lock);
}
}
- write_unlock(&addrconf_hash_lock);
+ read_unlock(&addrconf_hash_lock);
}
addr_chk_timer.expires = time_before(next, jiffies + HZ) ? jiffies + HZ : next;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 4f8795af2ed..d9546380fa0 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -92,10 +92,13 @@ static int inet6_create(struct socket *sock, int protocol)
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
- int rc;
+ int try_loading_module = 0;
+ int err;
/* Look for the requested type/protocol pair. */
answer = NULL;
+lookup_protocol:
+ err = -ESOCKTNOSUPPORT;
rcu_read_lock();
list_for_each_rcu(p, &inetsw6[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
@@ -113,21 +116,37 @@ static int inet6_create(struct socket *sock, int protocol)
if (IPPROTO_IP == answer->protocol)
break;
}
+ err = -EPROTONOSUPPORT;
answer = NULL;
}
- rc = -ESOCKTNOSUPPORT;
- if (!answer)
- goto out_rcu_unlock;
- rc = -EPERM;
+ if (!answer) {
+ if (try_loading_module < 2) {
+ rcu_read_unlock();
+ /*
+ * Be more specific, e.g. net-pf-10-proto-132-type-1
+ * (net-pf-PF_INET6-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+ */
+ if (++try_loading_module == 1)
+ request_module("net-pf-%d-proto-%d-type-%d",
+ PF_INET6, protocol, sock->type);
+ /*
+ * Fall back to generic, e.g. net-pf-10-proto-132
+ * (net-pf-PF_INET6-proto-IPPROTO_SCTP)
+ */
+ else
+ request_module("net-pf-%d-proto-%d",
+ PF_INET6, protocol);
+ goto lookup_protocol;
+ } else
+ goto out_rcu_unlock;
+ }
+
+ err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
- rc = -EPROTONOSUPPORT;
- if (!protocol)
- goto out_rcu_unlock;
sock->ops = answer->ops;
-
answer_prot = answer->prot;
answer_no_check = answer->no_check;
answer_flags = answer->flags;
@@ -135,14 +154,14 @@ static int inet6_create(struct socket *sock, int protocol)
BUG_TRAP(answer_prot->slab != NULL);
- rc = -ENOBUFS;
+ err = -ENOBUFS;
sk = sk_alloc(PF_INET6, GFP_KERNEL, answer_prot, 1);
if (sk == NULL)
goto out;
sock_init_data(sock, sk);
- rc = 0;
+ err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = 1;
@@ -202,14 +221,14 @@ static int inet6_create(struct socket *sock, int protocol)
sk->sk_prot->hash(sk);
}
if (sk->sk_prot->init) {
- rc = sk->sk_prot->init(sk);
- if (rc) {
+ err = sk->sk_prot->init(sk);
+ if (err) {
sk_common_release(sk);
goto out;
}
}
out:
- return rc;
+ return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
@@ -699,12 +718,14 @@ static int __init inet6_init(void)
/* Register the family here so that the init calls below will
* be able to create sockets. (?? is this dangerous ??)
*/
- (void) sock_register(&inet6_family_ops);
+ err = sock_register(&inet6_family_ops);
+ if (err)
+ goto out_unregister_raw_proto;
/* Initialise ipv6 mibs */
err = init_ipv6_mibs();
if (err)
- goto out_unregister_raw_proto;
+ goto out_unregister_sock;
/*
* ipngwg API draft makes clear that the correct semantics
@@ -796,6 +817,8 @@ icmp_fail:
ipv6_sysctl_unregister();
#endif
cleanup_ipv6_mibs();
+out_unregister_sock:
+ sock_unregister(PF_INET6);
out_unregister_raw_proto:
proto_unregister(&rawv6_prot);
out_unregister_udp_proto:
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index cc518405b3e..c4a3a993acb 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -437,7 +437,7 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
break;
case IPPROTO_AH:
nexthdr = ptr[0];
- len = (ptr[1] + 1) << 2;
+ len = (ptr[1] + 2) << 2;
break;
default:
nexthdr = ptr[0];
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 40d9a1935ab..8bfbe997079 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -248,7 +248,7 @@ static u32 esp6_get_max_size(struct xfrm_state *x, int mtu)
if (esp->conf.padlen)
mtu = ALIGN(mtu, esp->conf.padlen);
- return mtu + x->props.header_len + esp->auth.icv_full_len;
+ return mtu + x->props.header_len + esp->auth.icv_trunc_len;
}
static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 922549581ab..be6faf31138 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -628,6 +628,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
if (!tot_len)
return NULL;
+ tot_len += sizeof(*opt2);
opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC);
if (!opt2)
return ERR_PTR(-ENOBUFS);
@@ -668,7 +669,26 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
return opt2;
out:
- sock_kfree_s(sk, p, tot_len);
+ sock_kfree_s(sk, opt2, opt2->tot_len);
return ERR_PTR(err);
}
+struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+ struct ipv6_txoptions *opt)
+{
+ /*
+ * ignore the dest before srcrt unless srcrt is being included.
+ * --yoshfuji
+ */
+ if (opt && opt->dst0opt && !opt->srcrt) {
+ if (opt_space != opt) {
+ memcpy(opt_space, opt, sizeof(*opt_space));
+ opt = opt_space;
+ }
+ opt->opt_nflen -= ipv6_optlen(opt->dst0opt);
+ opt->dst0opt = NULL;
+ }
+
+ return opt;
+}
+
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 23e540365a1..6ec6a2b549b 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -328,8 +328,10 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
iif = skb->dev->ifindex;
/*
- * Must not send if we know that source is Anycast also.
- * for now we don't know that.
+ * Must not send error if the source does not uniquely
+ * identify a single node (RFC2463 Section 2.4).
+ * We check unspecified / multicast addresses here,
+ * and anycast addresses will be checked later.
*/
if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n");
@@ -373,6 +375,16 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
err = ip6_dst_lookup(sk, &dst, &fl);
if (err)
goto out;
+
+ /*
+ * We won't send icmp if the destination is known
+ * anycast.
+ */
+ if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) {
+ LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n");
+ goto out_dst_release;
+ }
+
if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
goto out;
@@ -585,17 +597,16 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
daddr = &skb->nh.ipv6h->daddr;
/* Perform checksum. */
- if (skb->ip_summed == CHECKSUM_HW) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
- skb->csum)) {
- LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n");
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
- if (skb->ip_summed == CHECKSUM_NONE) {
- if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
- skb_checksum(skb, 0, skb->len, 0))) {
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
+ if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
+ skb->csum))
+ break;
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len,
+ IPPROTO_ICMPV6, 0);
+ if (__skb_checksum_complete(skb)) {
LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
NIP6(*saddr), NIP6(*daddr));
goto discard_it;
@@ -752,7 +763,7 @@ void icmpv6_cleanup(void)
inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
}
-static struct icmp6_err {
+static const struct icmp6_err {
int err;
int fatal;
} tab_unreach[] = {
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index bbbe80cdaf7..1cf02765fb5 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -225,20 +225,16 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
struct ip6_flowlabel * fl,
struct ipv6_txoptions * fopt)
{
- struct ipv6_txoptions * fl_opt = fl ? fl->opt : NULL;
-
- if (fopt == NULL || fopt->opt_flen == 0) {
- if (!fl_opt || !fl_opt->dst0opt || fl_opt->srcrt)
- return fl_opt;
- }
-
+ struct ipv6_txoptions * fl_opt = fl->opt;
+
+ if (fopt == NULL || fopt->opt_flen == 0)
+ return fl_opt;
+
if (fl_opt != NULL) {
opt_space->hopopt = fl_opt->hopopt;
- opt_space->dst0opt = fl_opt->srcrt ? fl_opt->dst0opt : NULL;
+ opt_space->dst0opt = fl_opt->dst0opt;
opt_space->srcrt = fl_opt->srcrt;
opt_space->opt_nflen = fl_opt->opt_nflen;
- if (fl_opt->dst0opt && !fl_opt->srcrt)
- opt_space->opt_nflen -= ipv6_optlen(fl_opt->dst0opt);
} else {
if (fopt->opt_nflen == 0)
return fopt;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c1fa693511a..8523c76ebf7 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -774,7 +774,8 @@ out_err_release:
*dst = NULL;
return err;
}
-inline int ip6_ufo_append_data(struct sock *sk,
+
+static inline int ip6_ufo_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 003fd99ff59..3620718defe 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -287,7 +287,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
{
struct ipv6_txoptions *opt;
if (optlen == 0)
- optval = 0;
+ optval = NULL;
/* hop-by-hop / destination options are privileged option */
retv = -EPERM;
@@ -628,8 +628,8 @@ e_inval:
return -EINVAL;
}
-int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_opt_hdr *hdr,
- char __user *optval, int len)
+static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_opt_hdr *hdr,
+ char __user *optval, int len)
{
if (!hdr)
return 0;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index f15e04ad026..f829a4ad3cc 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -170,7 +170,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
#define MLDV2_QQIC(value) MLDV2_EXP(0x80, 4, 3, value)
#define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value)
-#define IPV6_MLD_MAX_MSF 10
+#define IPV6_MLD_MAX_MSF 64
int sysctl_mld_max_msf = IPV6_MLD_MAX_MSF;
@@ -224,6 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
mc_lst->ifindex = dev->ifindex;
mc_lst->sfmode = MCAST_EXCLUDE;
+ mc_lst->sflock = RW_LOCK_UNLOCKED;
mc_lst->sflist = NULL;
/*
@@ -360,6 +361,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
struct ip6_sf_socklist *psl;
int i, j, rv;
int leavegroup = 0;
+ int pmclocked = 0;
int err;
if (pgsr->gsr_group.ss_family != AF_INET6 ||
@@ -403,6 +405,9 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
pmc->sfmode = omode;
}
+ write_lock_bh(&pmc->sflock);
+ pmclocked = 1;
+
psl = pmc->sflist;
if (!add) {
if (!psl)
@@ -475,6 +480,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
/* update the interface list */
ip6_mc_add_src(idev, group, omode, 1, source, 1);
done:
+ if (pmclocked)
+ write_unlock_bh(&pmc->sflock);
read_unlock_bh(&ipv6_sk_mc_lock);
read_unlock_bh(&idev->lock);
in6_dev_put(idev);
@@ -510,6 +517,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
dev = idev->dev;
err = 0;
+ read_lock_bh(&ipv6_sk_mc_lock);
+
if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {
leavegroup = 1;
goto done;
@@ -549,6 +558,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
newpsl = NULL;
(void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
}
+
+ write_lock_bh(&pmc->sflock);
psl = pmc->sflist;
if (psl) {
(void) ip6_mc_del_src(idev, group, pmc->sfmode,
@@ -558,8 +569,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
(void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
pmc->sflist = newpsl;
pmc->sfmode = gsf->gf_fmode;
+ write_unlock_bh(&pmc->sflock);
err = 0;
done:
+ read_unlock_bh(&ipv6_sk_mc_lock);
read_unlock_bh(&idev->lock);
in6_dev_put(idev);
dev_put(dev);
@@ -592,6 +605,11 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
dev = idev->dev;
err = -EADDRNOTAVAIL;
+ /*
+ * changes to the ipv6_mc_list require the socket lock and
+ * a read lock on ip6_sk_mc_lock. We have the socket lock,
+ * so reading the list is safe.
+ */
for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
if (pmc->ifindex != gsf->gf_interface)
@@ -614,6 +632,10 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
return -EFAULT;
}
+ /* changes to psl require the socket lock, a read lock on
+ * on ipv6_sk_mc_lock and a write lock on pmc->sflock. We
+ * have the socket lock, so reading here is safe.
+ */
for (i=0; i<copycount; i++) {
struct sockaddr_in6 *psin6;
struct sockaddr_storage ss;
@@ -650,6 +672,7 @@ int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr,
read_unlock(&ipv6_sk_mc_lock);
return 1;
}
+ read_lock(&mc->sflock);
psl = mc->sflist;
if (!psl) {
rv = mc->sfmode == MCAST_EXCLUDE;
@@ -665,6 +688,7 @@ int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr,
if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
rv = 0;
}
+ read_unlock(&mc->sflock);
read_unlock(&ipv6_sk_mc_lock);
return rv;
@@ -1068,23 +1092,64 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime)
ma->mca_flags |= MAF_TIMER_RUNNING;
}
-static void mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
+/* mark EXCLUDE-mode sources */
+static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs,
+ struct in6_addr *srcs)
+{
+ struct ip6_sf_list *psf;
+ int i, scount;
+
+ scount = 0;
+ for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+ if (scount == nsrcs)
+ break;
+ for (i=0; i<nsrcs; i++) {
+ /* skip inactive filters */
+ if (pmc->mca_sfcount[MCAST_INCLUDE] ||
+ pmc->mca_sfcount[MCAST_EXCLUDE] !=
+ psf->sf_count[MCAST_EXCLUDE])
+ continue;
+ if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) {
+ scount++;
+ break;
+ }
+ }
+ }
+ pmc->mca_flags &= ~MAF_GSQUERY;
+ if (scount == nsrcs) /* all sources excluded */
+ return 0;
+ return 1;
+}
+
+static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
struct in6_addr *srcs)
{
struct ip6_sf_list *psf;
int i, scount;
+ if (pmc->mca_sfmode == MCAST_EXCLUDE)
+ return mld_xmarksources(pmc, nsrcs, srcs);
+
+ /* mark INCLUDE-mode sources */
+
scount = 0;
for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
if (scount == nsrcs)
break;
- for (i=0; i<nsrcs; i++)
+ for (i=0; i<nsrcs; i++) {
if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) {
psf->sf_gsresp = 1;
scount++;
break;
}
+ }
+ }
+ if (!scount) {
+ pmc->mca_flags &= ~MAF_GSQUERY;
+ return 0;
}
+ pmc->mca_flags |= MAF_GSQUERY;
+ return 1;
}
int igmp6_event_query(struct sk_buff *skb)
@@ -1167,7 +1232,7 @@ int igmp6_event_query(struct sk_buff *skb)
/* mark sources to include, if group & source-specific */
if (mlh2->nsrcs != 0) {
if (!pskb_may_pull(skb, srcs_offset +
- mlh2->nsrcs * sizeof(struct in6_addr))) {
+ ntohs(mlh2->nsrcs) * sizeof(struct in6_addr))) {
in6_dev_put(idev);
return -EINVAL;
}
@@ -1203,10 +1268,9 @@ int igmp6_event_query(struct sk_buff *skb)
else
ma->mca_flags &= ~MAF_GSQUERY;
}
- if (ma->mca_flags & MAF_GSQUERY)
- mld_marksources(ma, ntohs(mlh2->nsrcs),
- mlh2->srcs);
- igmp6_group_queried(ma, max_delay);
+ if (!(ma->mca_flags & MAF_GSQUERY) ||
+ mld_marksources(ma, ntohs(mlh2->nsrcs), mlh2->srcs))
+ igmp6_group_queried(ma, max_delay);
spin_unlock_bh(&ma->mca_lock);
if (group_type != IPV6_ADDR_ANY)
break;
@@ -1231,6 +1295,11 @@ int igmp6_event_report(struct sk_buff *skb)
if (skb->pkt_type == PACKET_LOOPBACK)
return 0;
+ /* send our report if the MC router may not have heard this report */
+ if (skb->pkt_type != PACKET_MULTICAST &&
+ skb->pkt_type != PACKET_BROADCAST)
+ return 0;
+
if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
return -EINVAL;
@@ -1276,7 +1345,18 @@ static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type,
case MLD2_MODE_IS_EXCLUDE:
if (gdeleted || sdeleted)
return 0;
- return !((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp);
+ if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) {
+ if (pmc->mca_sfmode == MCAST_INCLUDE)
+ return 1;
+ /* don't include if this source is excluded
+ * in all filters
+ */
+ if (psf->sf_count[MCAST_INCLUDE])
+ return 0;
+ return pmc->mca_sfcount[MCAST_EXCLUDE] ==
+ psf->sf_count[MCAST_EXCLUDE];
+ }
+ return 0;
case MLD2_CHANGE_TO_INCLUDE:
if (gdeleted || sdeleted)
return 0;
@@ -1445,7 +1525,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
struct mld2_report *pmr;
struct mld2_grec *pgr = NULL;
struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
- int scount, first, isquery, truncate;
+ int scount, stotal, first, isquery, truncate;
if (pmc->mca_flags & MAF_NOREPORT)
return skb;
@@ -1455,25 +1535,13 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
truncate = type == MLD2_MODE_IS_EXCLUDE ||
type == MLD2_CHANGE_TO_EXCLUDE;
+ stotal = scount = 0;
+
psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources;
- if (!*psf_list) {
- if (type == MLD2_ALLOW_NEW_SOURCES ||
- type == MLD2_BLOCK_OLD_SOURCES)
- return skb;
- if (pmc->mca_crcount || isquery) {
- /* make sure we have room for group header and at
- * least one source.
- */
- if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)+
- sizeof(struct in6_addr)) {
- mld_sendpack(skb);
- skb = NULL; /* add_grhead will get a new one */
- }
- skb = add_grhead(skb, pmc, type, &pgr);
- }
- return skb;
- }
+ if (!*psf_list)
+ goto empty_source;
+
pmr = skb ? (struct mld2_report *)skb->h.raw : NULL;
/* EX and TO_EX get a fresh packet, if needed */
@@ -1486,7 +1554,6 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
}
}
first = 1;
- scount = 0;
psf_prev = NULL;
for (psf=*psf_list; psf; psf=psf_next) {
struct in6_addr *psrc;
@@ -1520,7 +1587,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
}
psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc));
*psrc = psf->sf_addr;
- scount++;
+ scount++; stotal++;
if ((type == MLD2_ALLOW_NEW_SOURCES ||
type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
psf->sf_crcount--;
@@ -1535,6 +1602,21 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
}
psf_prev = psf;
}
+
+empty_source:
+ if (!stotal) {
+ if (type == MLD2_ALLOW_NEW_SOURCES ||
+ type == MLD2_BLOCK_OLD_SOURCES)
+ return skb;
+ if (pmc->mca_crcount || isquery) {
+ /* make sure we have room for group header */
+ if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) {
+ mld_sendpack(skb);
+ skb = NULL; /* add_grhead will get a new one */
+ }
+ skb = add_grhead(skb, pmc, type, &pgr);
+ }
+ }
if (pgr)
pgr->grec_nsrcs = htons(scount);
@@ -1616,11 +1698,11 @@ static void mld_send_cr(struct inet6_dev *idev)
skb = add_grec(skb, pmc, dtype, 1, 1);
}
if (pmc->mca_crcount) {
- pmc->mca_crcount--;
if (pmc->mca_sfmode == MCAST_EXCLUDE) {
type = MLD2_CHANGE_TO_INCLUDE;
skb = add_grec(skb, pmc, type, 1, 0);
}
+ pmc->mca_crcount--;
if (pmc->mca_crcount == 0) {
mld_clear_zeros(&pmc->mca_tomb);
mld_clear_zeros(&pmc->mca_sources);
@@ -1654,12 +1736,12 @@ static void mld_send_cr(struct inet6_dev *idev)
/* filter mode changes */
if (pmc->mca_crcount) {
- pmc->mca_crcount--;
if (pmc->mca_sfmode == MCAST_EXCLUDE)
type = MLD2_CHANGE_TO_EXCLUDE;
else
type = MLD2_CHANGE_TO_INCLUDE;
skb = add_grec(skb, pmc, type, 0, 0);
+ pmc->mca_crcount--;
}
spin_unlock_bh(&pmc->mca_lock);
}
@@ -2018,6 +2100,9 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
{
int err;
+ /* callers have the socket lock and a write lock on ipv6_sk_mc_lock,
+ * so no other readers or writers of iml or its sflist
+ */
if (iml->sflist == 0) {
/* any-source empty exclude case */
return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 971ba60bf6e..04912f9b35c 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -5,10 +5,20 @@
menu "IPv6: Netfilter Configuration (EXPERIMENTAL)"
depends on INET && IPV6 && NETFILTER && EXPERIMENTAL
-#tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP6_NF_CONNTRACK
-#if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then
-# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK
-#fi
+config NF_CONNTRACK_IPV6
+ tristate "IPv6 support for new connection tracking (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && NF_CONNTRACK
+ ---help---
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections.
+
+ This is IPv6 support on Layer 3 independent connection tracking.
+ Layer 3 independent connection tracking is experimental scheme
+ which generalize ip_conntrack to support other layer 3 protocols.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP6_NF_QUEUE
tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)"
---help---
@@ -114,7 +124,6 @@ config IP6_NF_MATCH_OWNER
To compile it as a module, choose M here. If unsure, say N.
-# dep_tristate ' MAC address match support' CONFIG_IP6_NF_MATCH_MAC $CONFIG_IP6_NF_IPTABLES
config IP6_NF_MATCH_MARK
tristate "netfilter MARK match support"
depends on IP6_NF_IPTABLES
@@ -170,15 +179,6 @@ config IP6_NF_MATCH_PHYSDEV
To compile it as a module, choose M here. If unsure, say N.
-# dep_tristate ' Multiple port match support' CONFIG_IP6_NF_MATCH_MULTIPORT $CONFIG_IP6_NF_IPTABLES
-# dep_tristate ' TOS match support' CONFIG_IP6_NF_MATCH_TOS $CONFIG_IP6_NF_IPTABLES
-# if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then
-# dep_tristate ' Connection state match support' CONFIG_IP6_NF_MATCH_STATE $CONFIG_IP6_NF_CONNTRACK $CONFIG_IP6_NF_IPTABLES
-# fi
-# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
-# dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_UNCLEAN $CONFIG_IP6_NF_IPTABLES
-# dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_OWNER $CONFIG_IP6_NF_IPTABLES
-# fi
# The targets
config IP6_NF_FILTER
tristate "Packet filtering"
@@ -211,7 +211,7 @@ config IP6_NF_TARGET_REJECT
config IP6_NF_TARGET_NFQUEUE
tristate "NFQUEUE Target Support"
- depends on IP_NF_IPTABLES
+ depends on IP6_NF_IPTABLES
help
This Target replaced the old obsolete QUEUE target.
@@ -220,12 +220,6 @@ config IP6_NF_TARGET_NFQUEUE
To compile it as a module, choose M here. If unsure, say N.
-# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then
-# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER
-# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
-# dep_tristate ' MIRROR target support (EXPERIMENTAL)' CONFIG_IP6_NF_TARGET_MIRROR $CONFIG_IP6_NF_FILTER
-# fi
-# fi
config IP6_NF_MANGLE
tristate "Packet mangling"
depends on IP6_NF_IPTABLES
@@ -236,7 +230,6 @@ config IP6_NF_MANGLE
To compile it as a module, choose M here. If unsure, say N.
-# dep_tristate ' TOS target support' CONFIG_IP6_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE
config IP6_NF_TARGET_MARK
tristate "MARK target support"
depends on IP6_NF_MANGLE
@@ -266,7 +259,6 @@ config IP6_NF_TARGET_HL
To compile it as a module, choose M here. If unsure, say N.
-#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES
config IP6_NF_RAW
tristate 'raw table support (required for TRACE)'
depends on IP6_NF_IPTABLES
@@ -278,19 +270,5 @@ config IP6_NF_RAW
If you want to compile it as a module, say M here and read
<file:Documentation/modules.txt>. If unsure, say `N'.
-config NF_CONNTRACK_IPV6
- tristate "IPv6 support for new connection tracking (EXPERIMENTAL)"
- depends on EXPERIMENTAL && NF_CONNTRACK
- ---help---
- Connection tracking keeps a record of what packets have passed
- through your machine, in order to figure out how they are related
- into connections.
-
- This is IPv6 support on Layer 3 independent connection tracking.
- Layer 3 independent connection tracking is experimental scheme
- which generalize ip_conntrack to support other layer 3 protocols.
-
- To compile it as a module, choose M here. If unsure, say N.
-
endmenu
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7d492226c16..95d469271c4 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1972,7 +1972,7 @@ static int ip6t_get_matches(char *buffer, char **start, off_t offset, int length
return pos;
}
-static struct { char *name; get_info_t *get_info; } ip6t_proc_entry[] =
+static const struct { char *name; get_info_t *get_info; } ip6t_proc_entry[] =
{ { "ip6_tables_names", ip6t_get_tables },
{ "ip6_tables_targets", ip6t_get_targets },
{ "ip6_tables_matches", ip6t_get_matches },
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index e2c90b3a807..753a3ae8502 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -339,8 +339,8 @@ extern unsigned long nf_ct_icmpv6_timeout;
/* From nf_conntrack_frag6.c */
extern unsigned long nf_ct_frag6_timeout;
-extern unsigned long nf_ct_frag6_low_thresh;
-extern unsigned long nf_ct_frag6_high_thresh;
+extern unsigned int nf_ct_frag6_low_thresh;
+extern unsigned int nf_ct_frag6_high_thresh;
static struct ctl_table_header *nf_ct_ipv6_sysctl_header;
@@ -367,7 +367,7 @@ static ctl_table nf_ct_sysctl_table[] = {
.data = &nf_ct_frag6_low_thresh,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
+ .proc_handler = &proc_dointvec,
},
{
.ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH,
@@ -375,7 +375,7 @@ static ctl_table nf_ct_sysctl_table[] = {
.data = &nf_ct_frag6_high_thresh,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
+ .proc_handler = &proc_dointvec,
},
{ .ctl_name = 0 }
};
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index c0f1da5497a..a7e03cfacd0 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -68,8 +68,8 @@ static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple,
[ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1
};
- __u8 type = orig->dst.u.icmp.type - 128;
- if (type >= sizeof(invmap) || !invmap[type])
+ int type = orig->dst.u.icmp.type - 128;
+ if (type < 0 || type >= sizeof(invmap) || !invmap[type])
return 0;
tuple->src.u.icmp.id = orig->src.u.icmp.id;
@@ -129,12 +129,12 @@ static int icmpv6_new(struct nf_conn *conntrack,
[ICMPV6_ECHO_REQUEST - 128] = 1,
[ICMPV6_NI_QUERY - 128] = 1
};
+ int type = conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128;
- if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new)
- || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) {
+ if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
/* Can't create a new ICMPv6 `conn' with this. */
- DEBUGP("icmp: can't create new conn with type %u\n",
- conntrack->tuplehash[0].tuple.dst.u.icmp.type);
+ DEBUGP("icmpv6: can't create new conn with type %u\n",
+ type + 128);
NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
return 0;
}
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 7640b9bb769..c2c52af9e56 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -55,9 +55,9 @@
#define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */
#define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT
-int nf_ct_frag6_high_thresh = 256*1024;
-int nf_ct_frag6_low_thresh = 192*1024;
-int nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT;
+unsigned int nf_ct_frag6_high_thresh = 256*1024;
+unsigned int nf_ct_frag6_low_thresh = 192*1024;
+unsigned long nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT;
struct nf_ct_frag6_skb_cb
{
@@ -190,8 +190,10 @@ static void nf_ct_frag6_secret_rebuild(unsigned long dummy)
atomic_t nf_ct_frag6_mem = ATOMIC_INIT(0);
/* Memory Tracking Functions. */
-static inline void frag_kfree_skb(struct sk_buff *skb)
+static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work)
{
+ if (work)
+ *work -= skb->truesize;
atomic_sub(skb->truesize, &nf_ct_frag6_mem);
if (NFCT_FRAG6_CB(skb)->orig)
kfree_skb(NFCT_FRAG6_CB(skb)->orig);
@@ -199,8 +201,11 @@ static inline void frag_kfree_skb(struct sk_buff *skb)
kfree_skb(skb);
}
-static inline void frag_free_queue(struct nf_ct_frag6_queue *fq)
+static inline void frag_free_queue(struct nf_ct_frag6_queue *fq,
+ unsigned int *work)
{
+ if (work)
+ *work -= sizeof(struct nf_ct_frag6_queue);
atomic_sub(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem);
kfree(fq);
}
@@ -218,7 +223,8 @@ static inline struct nf_ct_frag6_queue *frag_alloc_queue(void)
/* Destruction primitives. */
/* Complete destruction of fq. */
-static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq)
+static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq,
+ unsigned int *work)
{
struct sk_buff *fp;
@@ -230,17 +236,17 @@ static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq)
while (fp) {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(fp);
+ frag_kfree_skb(fp, work);
fp = xp;
}
- frag_free_queue(fq);
+ frag_free_queue(fq, work);
}
-static __inline__ void fq_put(struct nf_ct_frag6_queue *fq)
+static __inline__ void fq_put(struct nf_ct_frag6_queue *fq, unsigned int *work)
{
if (atomic_dec_and_test(&fq->refcnt))
- nf_ct_frag6_destroy(fq);
+ nf_ct_frag6_destroy(fq, work);
}
/* Kill fq entry. It is not destroyed immediately,
@@ -262,16 +268,21 @@ static void nf_ct_frag6_evictor(void)
{
struct nf_ct_frag6_queue *fq;
struct list_head *tmp;
+ unsigned int work;
- for (;;) {
- if (atomic_read(&nf_ct_frag6_mem) <= nf_ct_frag6_low_thresh)
- return;
+ work = atomic_read(&nf_ct_frag6_mem);
+ if (work <= nf_ct_frag6_low_thresh)
+ return;
+
+ work -= nf_ct_frag6_low_thresh;
+ while (work > 0) {
read_lock(&nf_ct_frag6_lock);
if (list_empty(&nf_ct_frag6_lru_list)) {
read_unlock(&nf_ct_frag6_lock);
return;
}
tmp = nf_ct_frag6_lru_list.next;
+ BUG_ON(tmp == NULL);
fq = list_entry(tmp, struct nf_ct_frag6_queue, lru_list);
atomic_inc(&fq->refcnt);
read_unlock(&nf_ct_frag6_lock);
@@ -281,7 +292,7 @@ static void nf_ct_frag6_evictor(void)
fq_kill(fq);
spin_unlock(&fq->lock);
- fq_put(fq);
+ fq_put(fq, &work);
}
}
@@ -298,7 +309,7 @@ static void nf_ct_frag6_expire(unsigned long data)
out:
spin_unlock(&fq->lock);
- fq_put(fq);
+ fq_put(fq, NULL);
}
/* Creation primitives. */
@@ -318,7 +329,7 @@ static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash,
atomic_inc(&fq->refcnt);
write_unlock(&nf_ct_frag6_lock);
fq_in->last_in |= COMPLETE;
- fq_put(fq_in);
+ fq_put(fq_in, NULL);
return fq;
}
}
@@ -535,7 +546,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
fq->fragments = next;
fq->meat -= free_it->len;
- frag_kfree_skb(free_it);
+ frag_kfree_skb(free_it, NULL);
}
}
@@ -811,7 +822,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
spin_unlock(&fq->lock);
DEBUGP("Can't insert skb to queue\n");
- fq_put(fq);
+ fq_put(fq, NULL);
goto ret_orig;
}
@@ -822,7 +833,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
}
spin_unlock(&fq->lock);
- fq_put(fq);
+ fq_put(fq, NULL);
return ret_skb;
ret_orig:
@@ -881,5 +892,6 @@ int nf_ct_frag6_init(void)
void nf_ct_frag6_cleanup(void)
{
del_timer(&nf_ct_frag6_secret_timer);
+ nf_ct_frag6_low_thresh = 0;
nf_ct_frag6_evictor();
}
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 651c79b41ee..a66900cda2a 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -298,13 +298,10 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb,
static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
{
if ((raw6_sk(sk)->checksum || sk->sk_filter) &&
- skb->ip_summed != CHECKSUM_UNNECESSARY) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
- /* FIXME: increment a raw6 drops counter here */
- kfree_skb(skb);
- return 0;
- }
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb_checksum_complete(skb)) {
+ /* FIXME: increment a raw6 drops counter here */
+ kfree_skb(skb);
+ return 0;
}
/* Charge it to the socket. */
@@ -337,32 +334,25 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
if (!rp->checksum)
skb->ip_summed = CHECKSUM_UNNECESSARY;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
- if (skb->ip_summed == CHECKSUM_HW) {
- skb_postpull_rcsum(skb, skb->nh.raw,
- skb->h.raw - skb->nh.raw);
+ if (skb->ip_summed == CHECKSUM_HW) {
+ skb_postpull_rcsum(skb, skb->nh.raw,
+ skb->h.raw - skb->nh.raw);
+ if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr,
+ &skb->nh.ipv6h->daddr,
+ skb->len, inet->num, skb->csum))
skb->ip_summed = CHECKSUM_UNNECESSARY;
- if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,
- skb->len, inet->num, skb->csum)) {
- LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n");
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
- if (skb->ip_summed == CHECKSUM_NONE)
- skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,
- skb->len, inet->num, 0);
}
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr,
+ &skb->nh.ipv6h->daddr,
+ skb->len, inet->num, 0);
if (inet->hdrincl) {
- if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
- (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
+ if (skb_checksum_complete(skb)) {
/* FIXME: increment a raw6 drops counter here */
kfree_skb(skb);
return 0;
}
- skb->ip_summed = CHECKSUM_UNNECESSARY;
}
rawv6_rcv_skb(sk, skb);
@@ -407,7 +397,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
} else if (msg->msg_flags&MSG_TRUNC) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)))
+ if (__skb_checksum_complete(skb))
goto csum_copy_err;
err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
} else {
@@ -758,7 +748,9 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
}
if (opt == NULL)
opt = np->opt;
- opt = fl6_merge_options(&opt_space, flowlabel, opt);
+ if (flowlabel)
+ opt = fl6_merge_options(&opt_space, flowlabel, opt);
+ opt = ipv6_fixup_options(&opt_space, opt);
fl.proto = proto;
rawv6_probe_proto_opt(&fl, msg);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index e4fe9ee484d..5d316cb72ec 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -74,7 +74,7 @@ struct ip6frag_skb_cb
struct frag_queue
{
- struct frag_queue *next;
+ struct hlist_node list;
struct list_head lru_list; /* lru list member */
__u32 id; /* fragment id */
@@ -95,14 +95,13 @@ struct frag_queue
#define FIRST_IN 2
#define LAST_IN 1
__u16 nhoffset;
- struct frag_queue **pprev;
};
/* Hash table. */
#define IP6Q_HASHSZ 64
-static struct frag_queue *ip6_frag_hash[IP6Q_HASHSZ];
+static struct hlist_head ip6_frag_hash[IP6Q_HASHSZ];
static DEFINE_RWLOCK(ip6_frag_lock);
static u32 ip6_frag_hash_rnd;
static LIST_HEAD(ip6_frag_lru_list);
@@ -110,9 +109,7 @@ int ip6_frag_nqueues = 0;
static __inline__ void __fq_unlink(struct frag_queue *fq)
{
- if(fq->next)
- fq->next->pprev = fq->pprev;
- *fq->pprev = fq->next;
+ hlist_del(&fq->list);
list_del(&fq->lru_list);
ip6_frag_nqueues--;
}
@@ -163,28 +160,21 @@ static void ip6_frag_secret_rebuild(unsigned long dummy)
get_random_bytes(&ip6_frag_hash_rnd, sizeof(u32));
for (i = 0; i < IP6Q_HASHSZ; i++) {
struct frag_queue *q;
+ struct hlist_node *p, *n;
- q = ip6_frag_hash[i];
- while (q) {
- struct frag_queue *next = q->next;
+ hlist_for_each_entry_safe(q, p, n, &ip6_frag_hash[i], list) {
unsigned int hval = ip6qhashfn(q->id,
&q->saddr,
&q->daddr);
if (hval != i) {
- /* Unlink. */
- if (q->next)
- q->next->pprev = q->pprev;
- *q->pprev = q->next;
+ hlist_del(&q->list);
/* Relink to new hash chain. */
- if ((q->next = ip6_frag_hash[hval]) != NULL)
- q->next->pprev = &q->next;
- ip6_frag_hash[hval] = q;
- q->pprev = &ip6_frag_hash[hval];
- }
+ hlist_add_head(&q->list,
+ &ip6_frag_hash[hval]);
- q = next;
+ }
}
}
write_unlock(&ip6_frag_lock);
@@ -337,10 +327,13 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash,
struct frag_queue *fq_in)
{
struct frag_queue *fq;
+#ifdef CONFIG_SMP
+ struct hlist_node *n;
+#endif
write_lock(&ip6_frag_lock);
#ifdef CONFIG_SMP
- for (fq = ip6_frag_hash[hash]; fq; fq = fq->next) {
+ hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) {
if (fq->id == fq_in->id &&
ipv6_addr_equal(&fq_in->saddr, &fq->saddr) &&
ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) {
@@ -358,10 +351,7 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash,
atomic_inc(&fq->refcnt);
atomic_inc(&fq->refcnt);
- if((fq->next = ip6_frag_hash[hash]) != NULL)
- fq->next->pprev = &fq->next;
- ip6_frag_hash[hash] = fq;
- fq->pprev = &ip6_frag_hash[hash];
+ hlist_add_head(&fq->list, &ip6_frag_hash[hash]);
INIT_LIST_HEAD(&fq->lru_list);
list_add_tail(&fq->lru_list, &ip6_frag_lru_list);
ip6_frag_nqueues++;
@@ -401,10 +391,11 @@ static __inline__ struct frag_queue *
fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst)
{
struct frag_queue *fq;
+ struct hlist_node *n;
unsigned int hash = ip6qhashfn(id, src, dst);
read_lock(&ip6_frag_lock);
- for(fq = ip6_frag_hash[hash]; fq; fq = fq->next) {
+ hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) {
if (fq->id == id &&
ipv6_addr_equal(src, &fq->saddr) &&
ipv6_addr_equal(dst, &fq->daddr)) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f7f42c3e96c..66140f13d11 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -413,11 +413,14 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
rt = ip6_rt_copy(ort);
if (rt) {
- ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
-
- if (!(rt->rt6i_flags&RTF_GATEWAY))
+ if (!(rt->rt6i_flags&RTF_GATEWAY)) {
+ if (rt->rt6i_dst.plen != 128 &&
+ ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
+ rt->rt6i_flags |= RTF_ANYCAST;
ipv6_addr_copy(&rt->rt6i_gateway, daddr);
+ }
+ ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
rt->rt6i_dst.plen = 128;
rt->rt6i_flags |= RTF_CACHE;
rt->u.dst.flags |= DST_HOST;
@@ -829,7 +832,7 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
}
rt->u.dst.obsolete = -1;
- rt->rt6i_expires = clock_t_to_jiffies(rtmsg->rtmsg_info);
+ rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
if (nlh && (r = NLMSG_DATA(nlh))) {
rt->rt6i_protocol = r->rtm_protocol;
} else {
@@ -1413,7 +1416,9 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
rt->u.dst.obsolete = -1;
rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
- if (!anycast)
+ if (anycast)
+ rt->rt6i_flags |= RTF_ANYCAST;
+ else
rt->rt6i_flags |= RTF_LOCAL;
rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
if (rt->rt6i_nexthop == NULL) {
@@ -1701,10 +1706,8 @@ static void fib6_dump_end(struct netlink_callback *cb)
fib6_walker_unlink(w);
kfree(w);
}
- if (cb->args[1]) {
- cb->done = (void*)cb->args[1];
- cb->args[1] = 0;
- }
+ cb->done = (void*)cb->args[1];
+ cb->args[1] = 0;
}
static int fib6_dump_done(struct netlink_callback *cb)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d746d3b27ef..8827389abaf 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -992,13 +992,12 @@ static void tcp_v6_send_reset(struct sk_buff *skb)
/* sk = NULL, but it is safe for now. RST socket required. */
if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) {
- if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0)
+ if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) {
+ ip6_xmit(NULL, buff, &fl, NULL, 0);
+ TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
+ TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
return;
-
- ip6_xmit(NULL, buff, &fl, NULL, 0);
- TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
- TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
- return;
+ }
}
kfree_skb(buff);
@@ -1057,11 +1056,11 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
fl.fl_ip_sport = t1->source;
if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) {
- if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0)
+ if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) {
+ ip6_xmit(NULL, buff, &fl, NULL, 0);
+ TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
return;
- ip6_xmit(NULL, buff, &fl, NULL, 0);
- TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
- return;
+ }
}
kfree_skb(buff);
@@ -1401,20 +1400,18 @@ out:
static int tcp_v6_checksum_init(struct sk_buff *skb)
{
if (skb->ip_summed == CHECKSUM_HW) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,skb->csum))
+ &skb->nh.ipv6h->daddr,skb->csum)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
return 0;
- LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n");
+ }
}
+
+ skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
+ &skb->nh.ipv6h->daddr, 0);
+
if (skb->len <= 76) {
- if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0)))
- return -1;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- } else {
- skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,0);
+ return __skb_checksum_complete(skb);
}
return 0;
}
@@ -1575,7 +1572,7 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
goto discard_it;
if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
- tcp_v6_checksum_init(skb) < 0))
+ tcp_v6_checksum_init(skb)))
goto bad_packet;
th = skb->h.th;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index bf9519341fd..5cc8731eb55 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -248,7 +248,7 @@ try_again:
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
copied);
} else if (msg->msg_flags&MSG_TRUNC) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)))
+ if (__skb_checksum_complete(skb))
goto csum_copy_err;
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
copied);
@@ -363,13 +363,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
return -1;
}
- if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
- UDP6_INC_STATS_BH(UDP_MIB_INERRORS);
- kfree_skb(skb);
- return 0;
- }
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ if (skb_checksum_complete(skb)) {
+ UDP6_INC_STATS_BH(UDP_MIB_INERRORS);
+ kfree_skb(skb);
+ return 0;
}
if (sock_queue_rcv_skb(sk,skb)<0) {
@@ -491,13 +488,10 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
uh = skb->h.uh;
}
- if (skb->ip_summed==CHECKSUM_HW) {
+ if (skb->ip_summed == CHECKSUM_HW &&
+ !csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum))
skb->ip_summed = CHECKSUM_UNNECESSARY;
- if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) {
- LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n");
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
+
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0);
@@ -521,8 +515,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
- (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)))
+ if (skb_checksum_complete(skb))
goto discard;
UDP6_INC_STATS_BH(UDP_MIB_NOPORTS);
@@ -778,7 +771,9 @@ do_udp_sendmsg:
}
if (opt == NULL)
opt = np->opt;
- opt = fl6_merge_options(&opt_space, flowlabel, opt);
+ if (flowlabel)
+ opt = fl6_merge_options(&opt_space, flowlabel, opt);
+ opt = ipv6_fixup_options(&opt_space, opt);
fl->proto = IPPROTO_UDP;
ipv6_addr_copy(&fl->fl6_dst, daddr);
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index cf1d91e74c8..69bd957380e 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -214,6 +214,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl)
case IPPROTO_UDP:
case IPPROTO_TCP:
case IPPROTO_SCTP:
+ case IPPROTO_DCCP:
if (pskb_may_pull(skb, skb->nh.raw + offset + 4 - skb->data)) {
u16 *ports = (u16 *)exthdr;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 59d02cbbeb9..c3f0b078345 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -116,7 +116,9 @@ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock)
struct llc_sock* llc = llc_sk(sk);
int rc = 0;
- if (unlikely(llc_data_accept_state(llc->state) || llc->p_flag)) {
+ if (unlikely(llc_data_accept_state(llc->state) ||
+ llc->remote_busy_flag ||
+ llc->p_flag)) {
long timeout = sock_sndtimeo(sk, noblock);
rc = llc_ui_wait_for_busy_core(sk, timeout);
@@ -542,6 +544,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
if (sk_wait_event(sk, &timeout,
(sk->sk_shutdown & RCV_SHUTDOWN) ||
(!llc_data_accept_state(llc->state) &&
+ !llc->remote_busy_flag &&
!llc->p_flag)))
break;
rc = -ERESTARTSYS;
diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c
index b0bcfb1f12d..8169f24ed33 100644
--- a/net/llc/llc_c_ac.c
+++ b/net/llc/llc_c_ac.c
@@ -866,7 +866,8 @@ int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb)
llc->ack_must_be_send = 1;
llc->ack_pf = pf_bit & 1;
}
- if (((llc->vR - llc->first_pdu_Ns + 129) % 128) >= llc->npta) {
+ if (((llc->vR - llc->first_pdu_Ns + 1 + LLC_2_SEQ_NBR_MODULO)
+ % LLC_2_SEQ_NBR_MODULO) >= llc->npta) {
llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb);
llc->ack_must_be_send = 0;
llc->ack_pf = 0;
@@ -994,8 +995,8 @@ static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb)
llc->dec_step = 0;
llc->dec_cntr = llc->inc_cntr = 2;
++llc->npta;
- if (llc->npta > 127)
- llc->npta = 127 ;
+ if (llc->npta > (u8) ~LLC_2_SEQ_NBR_MODULO)
+ llc->npta = (u8) ~LLC_2_SEQ_NBR_MODULO;
} else
--llc->inc_cntr;
return 0;
@@ -1065,9 +1066,10 @@ int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb)
struct llc_sock *llc = llc_sk(sk);
u8 unacked_pdu = skb_queue_len(&llc->pdu_unack_q);
- llc->k -= unacked_pdu;
- if (llc->k < 2)
- llc->k = 2;
+ if (llc->k - unacked_pdu < 1)
+ llc->k = 1;
+ else
+ llc->k -= unacked_pdu;
return 0;
}
@@ -1084,8 +1086,8 @@ int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb)
struct llc_sock *llc = llc_sk(sk);
llc->k += 1;
- if (llc->k > 128)
- llc->k = 128 ;
+ if (llc->k > (u8) ~LLC_2_SEQ_NBR_MODULO)
+ llc->k = (u8) ~LLC_2_SEQ_NBR_MODULO;
return 0;
}
@@ -1309,7 +1311,7 @@ int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb)
static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb)
{
- llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % 128;
+ llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % LLC_2_SEQ_NBR_MODULO;
return 0;
}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index a84f9221e5f..794c41d19b2 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -61,8 +61,8 @@ config NF_CONNTRACK_MARK
instead of the individual packets.
config NF_CONNTRACK_EVENTS
- bool "Connection tracking events"
- depends on NF_CONNTRACK
+ bool "Connection tracking events (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && NF_CONNTRACK
help
If this option is enabled, the connection tracking code will
provide a notifier chain that can be used by other kernel code
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 9a67c796b38..a7c7b490cf2 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -387,7 +387,7 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
{
ASSERT_WRITE_LOCK(&nf_conntrack_lock);
- NF_CT_ASSERT(!timer_pending(&exp_timeout));
+ NF_CT_ASSERT(!timer_pending(&exp->timeout));
list_del(&exp->list);
NF_CT_STAT_INC(expect_delete);
exp->master->expecting--;
@@ -1383,6 +1383,9 @@ void nf_conntrack_cleanup(void)
schedule();
goto i_see_dead_people;
}
+ /* wait until all references to nf_conntrack_untracked are dropped */
+ while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
+ schedule();
for (i = 0; i < NF_CT_F_NUM; i++) {
if (nf_ct_cache[i].use == 0)
@@ -1395,6 +1398,13 @@ void nf_conntrack_cleanup(void)
kmem_cache_destroy(nf_conntrack_expect_cachep);
free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
nf_conntrack_htable_size);
+
+ /* free l3proto protocol tables */
+ for (i = 0; i < PF_MAX; i++)
+ if (nf_ct_protos[i]) {
+ kfree(nf_ct_protos[i]);
+ nf_ct_protos[i] = NULL;
+ }
}
static struct list_head *alloc_hashtable(int size, int *vmalloced)
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 83d90dd624f..6035633d822 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -280,9 +280,9 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sCL -> sCL
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
+/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
/*
- * sSS -> sIV Might be a half-open connection.
+ * sSS -> sIG Might be a half-open connection.
* sSR -> sSR Might answer late resent SYN.
* sES -> sES :-)
* sFW -> sCW Normal close request answered by ACK.
@@ -779,6 +779,7 @@ static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
{
[TH_SYN] = 1,
[TH_SYN|TH_ACK] = 1,
+ [TH_SYN|TH_PUSH] = 1,
[TH_SYN|TH_ACK|TH_PUSH] = 1,
[TH_RST] = 1,
[TH_RST|TH_ACK] = 1,
@@ -911,8 +912,12 @@ static int tcp_packet(struct nf_conn *conntrack,
switch (new_state) {
case TCP_CONNTRACK_IGNORE:
- /* Either SYN in ORIGINAL
- * or SYN/ACK in REPLY. */
+ /* Ignored packets:
+ *
+ * a) SYN in ORIGINAL
+ * b) SYN/ACK in REPLY
+ * c) ACK in reply direction after initial SYN in original.
+ */
if (index == TCP_SYNACK_SET
&& conntrack->proto.tcp.last_index == TCP_SYN_SET
&& conntrack->proto.tcp.last_dir != dir
@@ -969,16 +974,29 @@ static int tcp_packet(struct nf_conn *conntrack,
conntrack->timeout.function((unsigned long)
conntrack);
return -NF_REPEAT;
+ } else {
+ write_unlock_bh(&tcp_lock);
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(pf, 0, skb, NULL, NULL,
+ NULL, "nf_ct_tcp: invalid SYN");
+ return -NF_ACCEPT;
}
case TCP_CONNTRACK_CLOSE:
if (index == TCP_RST_SET
- && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
- && conntrack->proto.tcp.last_index == TCP_SYN_SET
+ && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
+ && conntrack->proto.tcp.last_index == TCP_SYN_SET)
+ || (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
+ && conntrack->proto.tcp.last_index == TCP_ACK_SET))
&& ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) {
- /* RST sent to invalid SYN we had let trough
- * SYN was in window then, tear down connection.
+ /* RST sent to invalid SYN or ACK we had let trough
+ * at a) and c) above:
+ *
+ * a) SYN was in window then
+ * c) we hold a half-open connection.
+ *
+ * Delete our connection entry.
* We skip window checking, because packet might ACK
- * segments we ignored in the SYN. */
+ * segments we ignored. */
goto in_window;
}
/* Just fall trough */
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 45224db4fe2..5af381f9fe3 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -694,7 +694,7 @@ static int init_or_cleanup(int init)
cleanup_proc_stat:
#endif
#ifdef CONFIG_PROC_FS
- proc_net_remove("nf_conntrack_stat");
+ remove_proc_entry("nf_conntrack", proc_net_stat);
cleanup_proc_exp:
proc_net_remove("nf_conntrack_expect");
cleanup_proc:
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 83f4c53030f..95fdf04f1d8 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -162,7 +162,7 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys,
return -EINVAL;
}
- min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg));
+ min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
if (unlikely(nlh->nlmsg_len < min_len))
return -EINVAL;
@@ -223,6 +223,12 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
NFNL_SUBSYS_ID(nlh->nlmsg_type),
NFNL_MSG_TYPE(nlh->nlmsg_type));
+ if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) {
+ DEBUGP("missing CAP_NET_ADMIN\n");
+ *errp = -EPERM;
+ return -1;
+ }
+
/* Only requests are handled by kernel now. */
if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
DEBUGP("received non-request message\n");
@@ -230,8 +236,7 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
}
/* All the messages must at least contain nfgenmsg */
- if (nlh->nlmsg_len <
- NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) {
+ if (nlh->nlmsg_len < NLMSG_SPACE(sizeof(struct nfgenmsg))) {
DEBUGP("received message was too short\n");
return 0;
}
@@ -240,15 +245,12 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
ss = nfnetlink_get_subsys(type);
if (!ss) {
#ifdef CONFIG_KMOD
- if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) {
- /* don't call nfnl_shunlock, since it would reenter
- * with further packet processing */
- up(&nfnl_sem);
- request_module("nfnetlink-subsys-%d",
- NFNL_SUBSYS_ID(type));
- nfnl_shlock();
- ss = nfnetlink_get_subsys(type);
- }
+ /* don't call nfnl_shunlock, since it would reenter
+ * with further packet processing */
+ up(&nfnl_sem);
+ request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
+ nfnl_shlock();
+ ss = nfnetlink_get_subsys(type);
if (!ss)
#endif
goto err_inval;
@@ -260,13 +262,6 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
goto err_inval;
}
- if (nc->cap_required &&
- !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) {
- DEBUGP("permission denied for type %d\n", type);
- *errp = -EPERM;
- return -1;
- }
-
{
u_int16_t attr_count =
ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index d194676f365..cba63729313 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -862,11 +862,9 @@ out_put:
static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
[NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp,
- .attr_count = NFULA_MAX,
- .cap_required = CAP_NET_ADMIN, },
+ .attr_count = NFULA_MAX, },
[NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config,
- .attr_count = NFULA_CFG_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = NFULA_CFG_MAX, },
};
static struct nfnetlink_subsystem nfulnl_subsys = {
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f065a6c9495..f28460b61e4 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -931,14 +931,11 @@ out_put:
static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
[NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp,
- .attr_count = NFQA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = NFQA_MAX, },
[NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict,
- .attr_count = NFQA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = NFQA_MAX, },
[NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
- .attr_count = NFQA_CFG_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = NFQA_CFG_MAX, },
};
static struct nfnetlink_subsystem nfqnl_subsys = {
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 8c38ee6d255..96020d7087e 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -476,7 +476,7 @@ static int netlink_autobind(struct socket *sock)
struct hlist_head *head;
struct sock *osk;
struct hlist_node *node;
- s32 pid = current->pid;
+ s32 pid = current->tgid;
int err;
static s32 rover = -4097;
diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c
index 004e8599b8f..a7d88b5ad75 100644
--- a/net/netrom/nr_in.c
+++ b/net/netrom/nr_in.c
@@ -99,7 +99,7 @@ static int nr_state1_machine(struct sock *sk, struct sk_buff *skb,
break;
case NR_RESET:
- if (sysctl_netrom_reset_circuit);
+ if (sysctl_netrom_reset_circuit)
nr_disconnect(sk, ECONNRESET);
break;
@@ -130,7 +130,7 @@ static int nr_state2_machine(struct sock *sk, struct sk_buff *skb,
break;
case NR_RESET:
- if (sysctl_netrom_reset_circuit);
+ if (sysctl_netrom_reset_circuit)
nr_disconnect(sk, ECONNRESET);
break;
@@ -265,7 +265,7 @@ static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype
break;
case NR_RESET:
- if (sysctl_netrom_reset_circuit);
+ if (sysctl_netrom_reset_circuit)
nr_disconnect(sk, ECONNRESET);
break;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 499ae3df4a4..3e246276041 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1587,23 +1587,47 @@ static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
}
-static void free_pg_vec(char **pg_vec, unsigned order, unsigned len)
+static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
int i;
- for (i=0; i<len; i++) {
- if (pg_vec[i]) {
- struct page *page, *pend;
-
- pend = pg_vec_endpage(pg_vec[i], order);
- for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
- ClearPageReserved(page);
- free_pages((unsigned long)pg_vec[i], order);
- }
+ for (i = 0; i < len; i++) {
+ if (likely(pg_vec[i]))
+ free_pages((unsigned long) pg_vec[i], order);
}
kfree(pg_vec);
}
+static inline char *alloc_one_pg_vec_page(unsigned long order)
+{
+ return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
+ order);
+}
+
+static char **alloc_pg_vec(struct tpacket_req *req, int order)
+{
+ unsigned int block_nr = req->tp_block_nr;
+ char **pg_vec;
+ int i;
+
+ pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+ if (unlikely(!pg_vec))
+ goto out;
+
+ for (i = 0; i < block_nr; i++) {
+ pg_vec[i] = alloc_one_pg_vec_page(order);
+ if (unlikely(!pg_vec[i]))
+ goto out_free_pgvec;
+ }
+
+out:
+ return pg_vec;
+
+out_free_pgvec:
+ free_pg_vec(pg_vec, order, block_nr);
+ pg_vec = NULL;
+ goto out;
+}
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
{
@@ -1617,64 +1641,46 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
/* Sanity tests and some calculations */
- if (po->pg_vec)
+ if (unlikely(po->pg_vec))
return -EBUSY;
- if ((int)req->tp_block_size <= 0)
+ if (unlikely((int)req->tp_block_size <= 0))
return -EINVAL;
- if (req->tp_block_size&(PAGE_SIZE-1))
+ if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
return -EINVAL;
- if (req->tp_frame_size < TPACKET_HDRLEN)
+ if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
return -EINVAL;
- if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
+ if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
return -EINVAL;
po->frames_per_block = req->tp_block_size/req->tp_frame_size;
- if (po->frames_per_block <= 0)
+ if (unlikely(po->frames_per_block <= 0))
return -EINVAL;
- if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr)
+ if (unlikely((po->frames_per_block * req->tp_block_nr) !=
+ req->tp_frame_nr))
return -EINVAL;
- /* OK! */
-
- /* Allocate page vector */
- while ((PAGE_SIZE<<order) < req->tp_block_size)
- order++;
err = -ENOMEM;
-
- pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL);
- if (pg_vec == NULL)
+ order = get_order(req->tp_block_size);
+ pg_vec = alloc_pg_vec(req, order);
+ if (unlikely(!pg_vec))
goto out;
- memset(pg_vec, 0, req->tp_block_nr*sizeof(char **));
-
- for (i=0; i<req->tp_block_nr; i++) {
- struct page *page, *pend;
- pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order);
- if (!pg_vec[i])
- goto out_free_pgvec;
-
- pend = pg_vec_endpage(pg_vec[i], order);
- for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
- SetPageReserved(page);
- }
- /* Page vector is allocated */
l = 0;
- for (i=0; i<req->tp_block_nr; i++) {
+ for (i = 0; i < req->tp_block_nr; i++) {
char *ptr = pg_vec[i];
struct tpacket_hdr *header;
int k;
- for (k=0; k<po->frames_per_block; k++) {
-
- header = (struct tpacket_hdr*)ptr;
+ for (k = 0; k < po->frames_per_block; k++) {
+ header = (struct tpacket_hdr *) ptr;
header->tp_status = TP_STATUS_KERNEL;
ptr += req->tp_frame_size;
}
}
/* Done */
} else {
- if (req->tp_frame_nr)
+ if (unlikely(req->tp_frame_nr))
return -EINVAL;
}
@@ -1701,7 +1707,7 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
spin_lock_bh(&sk->sk_receive_queue.lock);
pg_vec = XC(po->pg_vec, pg_vec);
- po->frame_max = req->tp_frame_nr-1;
+ po->frame_max = (req->tp_frame_nr - 1);
po->head = 0;
po->frame_size = req->tp_frame_size;
spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -1728,7 +1734,6 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
release_sock(sk);
-out_free_pgvec:
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
@@ -1755,17 +1760,19 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st
if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
goto out;
- atomic_inc(&po->mapped);
start = vma->vm_start;
- err = -EAGAIN;
- for (i=0; i<po->pg_vec_len; i++) {
- if (remap_pfn_range(vma, start,
- __pa(po->pg_vec[i]) >> PAGE_SHIFT,
- po->pg_vec_pages*PAGE_SIZE,
- vma->vm_page_prot))
- goto out;
- start += po->pg_vec_pages*PAGE_SIZE;
+ for (i = 0; i < po->pg_vec_len; i++) {
+ struct page *page = virt_to_page(po->pg_vec[i]);
+ int pg_num;
+
+ for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
+ err = vm_insert_page(vma, start, page);
+ if (unlikely(err))
+ goto out;
+ start += PAGE_SIZE;
+ }
}
+ atomic_inc(&po->mapped);
vma->vm_ops = &packet_mmap_ops;
err = 0;
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c
index 122c086ee2d..dbe6105e83a 100644
--- a/net/rxrpc/transport.c
+++ b/net/rxrpc/transport.c
@@ -23,6 +23,7 @@
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/icmp.h>
+#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/ip.h>
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
@@ -475,15 +476,11 @@ void rxrpc_trans_receive_packet(struct rxrpc_transport *trans)
/* we'll probably need to checksum it (didn't call
* sock_recvmsg) */
- if (pkt->ip_summed != CHECKSUM_UNNECESSARY) {
- if ((unsigned short)
- csum_fold(skb_checksum(pkt, 0, pkt->len,
- pkt->csum))) {
- kfree_skb(pkt);
- rxrpc_krxiod_queue_transport(trans);
- _leave(" CSUM failed");
- return;
- }
+ if (skb_checksum_complete(pkt)) {
+ kfree_skb(pkt);
+ rxrpc_krxiod_queue_transport(trans);
+ _leave(" CSUM failed");
+ return;
}
addr = pkt->nh.iph->saddr;
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 7f34e7fd767..55cd5327fbd 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -40,9 +40,10 @@ config NET_SCHED
The available schedulers are listed in the following questions; you
can say Y to as many as you like. If unsure, say N now.
+if NET_SCHED
+
choice
prompt "Packet scheduler clock source"
- depends on NET_SCHED
default NET_SCH_CLK_JIFFIES
---help---
Packet schedulers need a monotonic clock that increments at a static
@@ -98,11 +99,9 @@ config NET_SCH_CLK_CPU
endchoice
comment "Queueing/Scheduling"
- depends on NET_SCHED
config NET_SCH_CBQ
tristate "Class Based Queueing (CBQ)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Class-Based Queueing (CBQ) packet
scheduling algorithm. This algorithm classifies the waiting packets
@@ -120,7 +119,6 @@ config NET_SCH_CBQ
config NET_SCH_HTB
tristate "Hierarchical Token Bucket (HTB)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Hierarchical Token Buckets (HTB)
packet scheduling algorithm. See
@@ -135,7 +133,6 @@ config NET_SCH_HTB
config NET_SCH_HFSC
tristate "Hierarchical Fair Service Curve (HFSC)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Hierarchical Fair Service Curve
(HFSC) packet scheduling algorithm.
@@ -145,7 +142,7 @@ config NET_SCH_HFSC
config NET_SCH_ATM
tristate "ATM Virtual Circuits (ATM)"
- depends on NET_SCHED && ATM
+ depends on ATM
---help---
Say Y here if you want to use the ATM pseudo-scheduler. This
provides a framework for invoking classifiers, which in turn
@@ -159,7 +156,6 @@ config NET_SCH_ATM
config NET_SCH_PRIO
tristate "Multi Band Priority Queueing (PRIO)"
- depends on NET_SCHED
---help---
Say Y here if you want to use an n-band priority queue packet
scheduler.
@@ -169,7 +165,6 @@ config NET_SCH_PRIO
config NET_SCH_RED
tristate "Random Early Detection (RED)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Random Early Detection (RED)
packet scheduling algorithm.
@@ -181,7 +176,6 @@ config NET_SCH_RED
config NET_SCH_SFQ
tristate "Stochastic Fairness Queueing (SFQ)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
packet scheduling algorithm .
@@ -193,7 +187,6 @@ config NET_SCH_SFQ
config NET_SCH_TEQL
tristate "True Link Equalizer (TEQL)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the True Link Equalizer (TLE) packet
scheduling algorithm. This queueing discipline allows the combination
@@ -206,7 +199,6 @@ config NET_SCH_TEQL
config NET_SCH_TBF
tristate "Token Bucket Filter (TBF)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Token Bucket Filter (TBF) packet
scheduling algorithm.
@@ -218,7 +210,6 @@ config NET_SCH_TBF
config NET_SCH_GRED
tristate "Generic Random Early Detection (GRED)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Generic Random Early Detection
(GRED) packet scheduling algorithm for some of your network devices
@@ -230,7 +221,6 @@ config NET_SCH_GRED
config NET_SCH_DSMARK
tristate "Differentiated Services marker (DSMARK)"
- depends on NET_SCHED
---help---
Say Y if you want to schedule packets according to the
Differentiated Services architecture proposed in RFC 2475.
@@ -242,7 +232,6 @@ config NET_SCH_DSMARK
config NET_SCH_NETEM
tristate "Network emulator (NETEM)"
- depends on NET_SCHED
---help---
Say Y if you want to emulate network delay, loss, and packet
re-ordering. This is often useful to simulate networks when
@@ -255,7 +244,6 @@ config NET_SCH_NETEM
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
- depends on NET_SCHED
---help---
Say Y here if you want to use classifiers for incoming packets.
If unsure, say Y.
@@ -264,14 +252,12 @@ config NET_SCH_INGRESS
module will be called sch_ingress.
comment "Classification"
- depends on NET_SCHED
config NET_CLS
boolean
config NET_CLS_BASIC
tristate "Elementary classification (BASIC)"
- depends NET_SCHED
select NET_CLS
---help---
Say Y here if you want to be able to classify packets using
@@ -282,7 +268,6 @@ config NET_CLS_BASIC
config NET_CLS_TCINDEX
tristate "Traffic-Control Index (TCINDEX)"
- depends NET_SCHED
select NET_CLS
---help---
Say Y here if you want to be able to classify packets based on
@@ -294,7 +279,6 @@ config NET_CLS_TCINDEX
config NET_CLS_ROUTE4
tristate "Routing decision (ROUTE)"
- depends NET_SCHED
select NET_CLS_ROUTE
select NET_CLS
---help---
@@ -306,11 +290,9 @@ config NET_CLS_ROUTE4
config NET_CLS_ROUTE
bool
- default n
config NET_CLS_FW
tristate "Netfilter mark (FW)"
- depends NET_SCHED
select NET_CLS
---help---
If you say Y here, you will be able to classify packets
@@ -321,7 +303,6 @@ config NET_CLS_FW
config NET_CLS_U32
tristate "Universal 32bit comparisons w/ hashing (U32)"
- depends NET_SCHED
select NET_CLS
---help---
Say Y here to be able to classify packetes using a universal
@@ -345,7 +326,6 @@ config CLS_U32_MARK
config NET_CLS_RSVP
tristate "IPv4 Resource Reservation Protocol (RSVP)"
- depends on NET_SCHED
select NET_CLS
select NET_ESTIMATOR
---help---
@@ -361,7 +341,6 @@ config NET_CLS_RSVP
config NET_CLS_RSVP6
tristate "IPv6 Resource Reservation Protocol (RSVP6)"
- depends on NET_SCHED
select NET_CLS
select NET_ESTIMATOR
---help---
@@ -377,7 +356,6 @@ config NET_CLS_RSVP6
config NET_EMATCH
bool "Extended Matches"
- depends NET_SCHED
select NET_CLS
---help---
Say Y here if you want to use extended matches on top of classifiers
@@ -456,7 +434,7 @@ config NET_EMATCH_TEXT
config NET_CLS_ACT
bool "Actions"
- depends on EXPERIMENTAL && NET_SCHED
+ depends on EXPERIMENTAL
select NET_ESTIMATOR
---help---
Say Y here if you want to use traffic control actions. Actions
@@ -539,7 +517,7 @@ config NET_ACT_SIMP
config NET_CLS_POLICE
bool "Traffic Policing (obsolete)"
- depends on NET_SCHED && NET_CLS_ACT!=y
+ depends on NET_CLS_ACT!=y
select NET_ESTIMATOR
---help---
Say Y here if you want to do traffic policing, i.e. strict
@@ -549,7 +527,7 @@ config NET_CLS_POLICE
config NET_CLS_IND
bool "Incoming device classification"
- depends on NET_SCHED && (NET_CLS_U32 || NET_CLS_FW)
+ depends on NET_CLS_U32 || NET_CLS_FW
---help---
Say Y here to extend the u32 and fw classifier to support
classification based on the incoming device. This option is
@@ -557,11 +535,12 @@ config NET_CLS_IND
config NET_ESTIMATOR
bool "Rate estimator"
- depends on NET_SCHED
---help---
Say Y here to allow using rate estimators to estimate the current
rate-of-flow for network devices, queues, etc. This module is
automaticaly selected if needed but can be selected manually for
statstical purposes.
+endif # NET_SCHED
+
endmenu
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 8aebe8f6d27..2ce1cb2aa2e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -34,7 +34,7 @@
#include <net/sch_generic.h>
#include <net/act_api.h>
-#if 1 /* control */
+#if 0 /* control */
#define DPRINTK(format, args...) printk(KERN_DEBUG format, ##args)
#else
#define DPRINTK(format, args...)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index cdc8d283791..82fb07aa06a 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -464,7 +464,7 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
const struct netem_skb_cb *cb
= (const struct netem_skb_cb *)skb->cb;
- if (PSCHED_TLESS(cb->time_to_send, ncb->time_to_send))
+ if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send))
break;
}
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 8c8ddf7f9b6..dec68a60477 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -128,9 +128,29 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
*/
asoc->max_burst = sctp_max_burst;
- /* Copy things from the endpoint. */
+ /* initialize association timers */
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0;
+
+ /* sctpimpguide Section 2.12.2
+ * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the
+ * recommended value of 5 times 'RTO.Max'.
+ */
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
+ = 5 * asoc->rto_max;
+
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
+ SCTP_DEFAULT_TIMEOUT_SACK;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
+ sp->autoclose * HZ;
+
+ /* Initilizes the timers */
for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) {
- asoc->timeouts[i] = ep->timeouts[i];
init_timer(&asoc->timers[i]);
asoc->timers[i].function = sctp_timer_events[i];
asoc->timers[i].data = (unsigned long) asoc;
@@ -157,10 +177,10 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
* RFC 6 - A SCTP receiver MUST be able to receive a minimum of
* 1500 bytes in one SCTP packet.
*/
- if (sk->sk_rcvbuf < SCTP_DEFAULT_MINWINDOW)
+ if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW)
asoc->rwnd = SCTP_DEFAULT_MINWINDOW;
else
- asoc->rwnd = sk->sk_rcvbuf;
+ asoc->rwnd = sk->sk_rcvbuf/2;
asoc->a_rwnd = asoc->rwnd;
@@ -172,6 +192,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
/* Set the sndbuf size for transmit. */
asoc->sndbuf_used = 0;
+ /* Initialize the receive memory counter */
+ atomic_set(&asoc->rmem_alloc, 0);
+
init_waitqueue_head(&asoc->wait);
asoc->c.my_vtag = sctp_generate_tag(ep);
@@ -380,6 +403,8 @@ static void sctp_association_destroy(struct sctp_association *asoc)
spin_unlock_bh(&sctp_assocs_id_lock);
}
+ BUG_TRAP(!atomic_read(&asoc->rmem_alloc));
+
if (asoc->base.malloced) {
kfree(asoc);
SCTP_DBG_OBJCNT_DEC(assoc);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 96984f7a2d6..67bd53070ee 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -70,7 +70,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
struct sock *sk,
gfp_t gfp)
{
- struct sctp_sock *sp = sctp_sk(sk);
memset(ep, 0, sizeof(struct sctp_endpoint));
/* Initialize the base structure. */
@@ -100,33 +99,14 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
/* Create the lists of associations. */
INIT_LIST_HEAD(&ep->asocs);
- /* Set up the base timeout information. */
- ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
- ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
- msecs_to_jiffies(sp->rtoinfo.srto_initial);
- ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
- msecs_to_jiffies(sp->rtoinfo.srto_initial);
- ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =
- msecs_to_jiffies(sp->rtoinfo.srto_initial);
- ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
- ep->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0;
-
- /* sctpimpguide-05 Section 2.12.2
- * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the
- * recommended value of 5 times 'RTO.Max'.
- */
- ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
- = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max);
-
- ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
- ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout;
- ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
-
/* Use SCTP specific send buffer space queues. */
ep->sndbuf_policy = sctp_sndbuf_policy;
sk->sk_write_space = sctp_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ /* Get the receive buffer policy for this endpoint */
+ ep->rcvbuf_policy = sctp_rcvbuf_policy;
+
/* Initialize the secret key used with cookie. */
get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE);
ep->last_key = ep->current_key = 0;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 28f32243397..b24ff2c1aef 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -100,21 +100,6 @@ static inline int sctp_rcv_checksum(struct sk_buff *skb)
return 0;
}
-/* The free routine for skbuffs that sctp receives */
-static void sctp_rfree(struct sk_buff *skb)
-{
- atomic_sub(sizeof(struct sctp_chunk),&skb->sk->sk_rmem_alloc);
- sock_rfree(skb);
-}
-
-/* The ownership wrapper routine to do receive buffer accounting */
-static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk)
-{
- skb_set_owner_r(skb,sk);
- skb->destructor = sctp_rfree;
- atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc);
-}
-
struct sctp_input_cb {
union {
struct inet_skb_parm h4;
@@ -217,9 +202,6 @@ int sctp_rcv(struct sk_buff *skb)
rcvr = &ep->base;
}
- if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
- goto discard_release;
-
/*
* RFC 2960, 8.4 - Handle "Out of the blue" Packets.
* An SCTP packet is called an "out of the blue" (OOTB)
@@ -256,8 +238,6 @@ int sctp_rcv(struct sk_buff *skb)
}
SCTP_INPUT_CB(skb)->chunk = chunk;
- sctp_rcv_set_owner_r(skb,sk);
-
/* Remember what endpoint is to handle this packet. */
chunk->rcvr = rcvr;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 26de4d3e1bd..f775d78aa59 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -530,6 +530,9 @@ static void sctp_v4_get_saddr(struct sctp_association *asoc,
{
struct rtable *rt = (struct rtable *)dst;
+ if (!asoc)
+ return;
+
if (rt) {
saddr->v4.sin_family = AF_INET;
saddr->v4.sin_port = asoc->base.bind_addr.port;
@@ -1047,6 +1050,9 @@ SCTP_STATIC __init int sctp_init(void)
/* Sendbuffer growth - do per-socket accounting */
sctp_sndbuf_policy = 0;
+ /* Rcvbuffer growth - do per-socket accounting */
+ sctp_rcvbuf_policy = 0;
+
/* HB.interval - 30 seconds */
sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f84173ea8ec..823947170a3 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -385,7 +385,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
NULL,
sctp_generate_t4_rto_event,
sctp_generate_t5_shutdown_guard_event,
- sctp_generate_heartbeat_event,
+ NULL,
sctp_generate_sack_event,
sctp_generate_autoclose_event,
};
@@ -689,9 +689,9 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
* increased due to timer expirations.
*/
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
- asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT];
+ asoc->rto_initial;
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
- asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE];
+ asoc->rto_initial;
}
if (sctp_state(asoc, ESTABLISHED) ||
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 505c7de10c5..475bfb4972d 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -5160,6 +5160,8 @@ static int sctp_eat_data(const struct sctp_association *asoc,
sctp_verb_t deliver;
int tmp;
__u32 tsn;
+ int account_value;
+ struct sock *sk = asoc->base.sk;
data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data;
skb_pull(chunk->skb, sizeof(sctp_datahdr_t));
@@ -5169,6 +5171,26 @@ static int sctp_eat_data(const struct sctp_association *asoc,
/* ASSERT: Now skb->data is really the user data. */
+ /*
+ * if we are established, and we have used up our receive
+ * buffer memory, drop the frame
+ */
+ if (asoc->state == SCTP_STATE_ESTABLISHED) {
+ /*
+ * If the receive buffer policy is 1, then each
+ * association can allocate up to sk_rcvbuf bytes
+ * otherwise, all the associations in aggregate
+ * may allocate up to sk_rcvbuf bytes
+ */
+ if (asoc->ep->rcvbuf_policy)
+ account_value = atomic_read(&asoc->rmem_alloc);
+ else
+ account_value = atomic_read(&sk->sk_rmem_alloc);
+
+ if (account_value > sk->sk_rcvbuf)
+ return SCTP_IERROR_IGNORE_TSN;
+ }
+
/* Process ECN based congestion.
*
* Since the chunk structure is reused for all chunks within
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b529af5e6f2..9df888e932c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -156,10 +156,6 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk)
sizeof(struct sk_buff) +
sizeof(struct sctp_chunk);
- sk->sk_wmem_queued += SCTP_DATA_SNDSIZE(chunk) +
- sizeof(struct sk_buff) +
- sizeof(struct sctp_chunk);
-
atomic_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
}
@@ -1932,7 +1928,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
if (copy_from_user(&sp->autoclose, optval, optlen))
return -EFAULT;
- sp->ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
return 0;
}
@@ -3426,7 +3421,7 @@ static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_add
}
static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port,
- void * __user *to, size_t space_left)
+ void __user **to, size_t space_left)
{
struct list_head *pos;
struct sctp_sockaddr_entry *addr;
@@ -4427,7 +4422,7 @@ cleanup:
* tcp_poll(). Note that, based on these implementations, we don't
* lock the socket in this function, even though it seems that,
* ideally, locking or some other mechanisms can be used to ensure
- * the integrity of the counters (sndbuf and wmem_queued) used
+ * the integrity of the counters (sndbuf and wmem_alloc) used
* in this place. We assume that we don't need locks either until proven
* otherwise.
*
@@ -4744,11 +4739,6 @@ static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
struct sk_buff *skb;
long timeo;
- /* Caller is allowed not to check sk->sk_err before calling. */
- error = sock_error(sk);
- if (error)
- goto no_packet;
-
timeo = sock_rcvtimeo(sk, noblock);
SCTP_DEBUG_PRINTK("Timeout: timeo: %ld, MAX: %ld.\n",
@@ -4775,6 +4765,11 @@ static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
if (skb)
return skb;
+ /* Caller is allowed not to check sk->sk_err before calling. */
+ error = sock_error(sk);
+ if (error)
+ goto no_packet;
+
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
@@ -4834,10 +4829,6 @@ static void sctp_wfree(struct sk_buff *skb)
sizeof(struct sk_buff) +
sizeof(struct sctp_chunk);
- sk->sk_wmem_queued -= SCTP_DATA_SNDSIZE(chunk) +
- sizeof(struct sk_buff) +
- sizeof(struct sctp_chunk);
-
atomic_sub(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
sock_wfree(skb);
@@ -4921,7 +4912,7 @@ void sctp_write_space(struct sock *sk)
/* Is there any sndbuf space available on the socket?
*
- * Note that wmem_queued is the sum of the send buffers on all of the
+ * Note that sk_wmem_alloc is the sum of the send buffers on all of the
* associations on the same socket. For a UDP-style socket with
* multiple associations, it is possible for it to be "unwriteable"
* prematurely. I assume that this is acceptable because
@@ -4934,7 +4925,7 @@ static int sctp_writeable(struct sock *sk)
{
int amt = 0;
- amt = sk->sk_sndbuf - sk->sk_wmem_queued;
+ amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
if (amt < 0)
amt = 0;
return amt;
@@ -5115,8 +5106,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) {
event = sctp_skb2event(skb);
if (event->asoc == assoc) {
+ sock_rfree(skb);
__skb_unlink(skb, &oldsk->sk_receive_queue);
__skb_queue_tail(&newsk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, newsk);
}
}
@@ -5144,8 +5137,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) {
event = sctp_skb2event(skb);
if (event->asoc == assoc) {
+ sock_rfree(skb);
__skb_unlink(skb, &oldsp->pd_lobby);
__skb_queue_tail(queue, skb);
+ skb_set_owner_r(skb, newsk);
}
}
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 75b28dd634f..fcd7096c953 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -121,6 +121,14 @@ static ctl_table sctp_table[] = {
.proc_handler = &proc_dointvec
},
{
+ .ctl_name = NET_SCTP_RCVBUF_POLICY,
+ .procname = "rcvbuf_policy",
+ .data = &sctp_rcvbuf_policy,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
.ctl_name = NET_SCTP_PATH_MAX_RETRANS,
.procname = "path_max_retrans",
.data = &sctp_max_retrans_path,
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 6bc27200e6c..268ddaf2dc0 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -261,7 +261,8 @@ void sctp_transport_route(struct sctp_transport *transport,
* association's active path for getsockname().
*/
if (asoc && (transport == asoc->peer.active_path))
- af->to_sk_saddr(&transport->saddr, asoc->base.sk);
+ opt->pf->af->to_sk_saddr(&transport->saddr,
+ asoc->base.sk);
} else
transport->pmtu = SCTP_DEFAULT_MAXSEGMENT;
}
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index e049f41faa4..ba97f974f57 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -52,19 +52,6 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
struct sctp_association *asoc);
static void sctp_ulpevent_release_data(struct sctp_ulpevent *event);
-/* Stub skb destructor. */
-static void sctp_stub_rfree(struct sk_buff *skb)
-{
-/* WARNING: This function is just a warning not to use the
- * skb destructor. If the skb is shared, we may get the destructor
- * callback on some processor that does not own the sock_lock. This
- * was occuring with PACKET socket applications that were monitoring
- * our skbs. We can't take the sock_lock, because we can't risk
- * recursing if we do really own the sock lock. Instead, do all
- * of our rwnd manipulation while we own the sock_lock outright.
- */
-}
-
/* Initialize an ULP event from an given skb. */
SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
{
@@ -111,15 +98,19 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
*/
sctp_association_hold((struct sctp_association *)asoc);
skb = sctp_event2skb(event);
- skb->sk = asoc->base.sk;
event->asoc = (struct sctp_association *)asoc;
- skb->destructor = sctp_stub_rfree;
+ atomic_add(skb->truesize, &event->asoc->rmem_alloc);
+ skb_set_owner_r(skb, asoc->base.sk);
}
/* A simple destructor to give up the reference to the association. */
static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
{
- sctp_association_put(event->asoc);
+ struct sctp_association *asoc = event->asoc;
+ struct sk_buff *skb = sctp_event2skb(event);
+
+ atomic_sub(skb->truesize, &asoc->rmem_alloc);
+ sctp_association_put(asoc);
}
/* Create and initialize an SCTP_ASSOC_CHANGE event.
@@ -922,7 +913,6 @@ done:
/* Free a ulpevent that has an owner. It includes releasing the reference
* to the owner, updating the rwnd in case of a DATA event and freeing the
* skb.
- * See comments in sctp_stub_rfree().
*/
void sctp_ulpevent_free(struct sctp_ulpevent *event)
{
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index f44f46f1d8e..8d782282ec1 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -638,7 +638,7 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
gss_msg);
atomic_inc(&gss_msg->count);
gss_unhash_msg(gss_msg);
- if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) {
+ if (msg->errno == -ETIMEDOUT) {
unsigned long now = jiffies;
if (time_after(now, ratelimit)) {
printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n"
@@ -786,7 +786,9 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int taskflags)
cred->gc_flags = 0;
cred->gc_base.cr_ops = &gss_credops;
cred->gc_service = gss_auth->service;
- err = gss_create_upcall(gss_auth, cred);
+ do {
+ err = gss_create_upcall(gss_auth, cred);
+ } while (err == -EAGAIN);
if (err < 0)
goto out_err;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 81e00a6c19d..16a2458f38f 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -39,23 +39,26 @@ static kmem_cache_t *rpc_inode_cachep __read_mostly;
#define RPC_UPCALL_TIMEOUT (30*HZ)
static void
-__rpc_purge_upcall(struct inode *inode, int err)
+__rpc_purge_list(struct rpc_inode *rpci, struct list_head *head, int err)
{
- struct rpc_inode *rpci = RPC_I(inode);
struct rpc_pipe_msg *msg;
+ void (*destroy_msg)(struct rpc_pipe_msg *);
- while (!list_empty(&rpci->pipe)) {
- msg = list_entry(rpci->pipe.next, struct rpc_pipe_msg, list);
- list_del_init(&msg->list);
- msg->errno = err;
- rpci->ops->destroy_msg(msg);
- }
- while (!list_empty(&rpci->in_upcall)) {
- msg = list_entry(rpci->pipe.next, struct rpc_pipe_msg, list);
+ destroy_msg = rpci->ops->destroy_msg;
+ while (!list_empty(head)) {
+ msg = list_entry(head->next, struct rpc_pipe_msg, list);
list_del_init(&msg->list);
msg->errno = err;
- rpci->ops->destroy_msg(msg);
+ destroy_msg(msg);
}
+}
+
+static void
+__rpc_purge_upcall(struct inode *inode, int err)
+{
+ struct rpc_inode *rpci = RPC_I(inode);
+
+ __rpc_purge_list(rpci, &rpci->pipe, err);
rpci->pipelen = 0;
wake_up(&rpci->waitq);
}
@@ -115,6 +118,7 @@ rpc_close_pipes(struct inode *inode)
down(&inode->i_sem);
if (rpci->ops != NULL) {
rpci->nreaders = 0;
+ __rpc_purge_list(rpci, &rpci->in_upcall, -EPIPE);
__rpc_purge_upcall(inode, -EPIPE);
rpci->nwriters = 0;
if (rpci->ops->release_pipe)
@@ -170,7 +174,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
goto out;
msg = (struct rpc_pipe_msg *)filp->private_data;
if (msg != NULL) {
- msg->errno = -EPIPE;
+ msg->errno = -EAGAIN;
list_del_init(&msg->list);
rpci->ops->destroy_msg(msg);
}
@@ -179,7 +183,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
if (filp->f_mode & FMODE_READ)
rpci->nreaders --;
if (!rpci->nreaders)
- __rpc_purge_upcall(inode, -EPIPE);
+ __rpc_purge_upcall(inode, -EAGAIN);
if (rpci->ops->release_pipe)
rpci->ops->release_pipe(inode);
out:
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 8f97e90f36c..eb330d4f66d 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -6,6 +6,9 @@
* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
*/
+#include <linux/compiler.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/pagemap.h>
#include <linux/udp.h>
@@ -165,6 +168,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
return -1;
if ((unsigned short)csum_fold(desc.csum))
return -1;
+ if (unlikely(skb->ip_summed == CHECKSUM_HW))
+ netdev_rx_csum_fault(skb->dev);
return 0;
no_checksum:
if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index f16e7cdd615..c6a51911e71 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -623,12 +623,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
/* we can use it in-place */
rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr);
rqstp->rq_arg.head[0].iov_len = len;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
- skb_free_datagram(svsk->sk_sk, skb);
- return 0;
- }
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ if (skb_checksum_complete(skb)) {
+ skb_free_datagram(svsk->sk_sk, skb);
+ return 0;
}
rqstp->rq_skbuff = skb;
}
@@ -1181,6 +1178,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
arg->tail[0].iov_len = 0;
try_to_freeze();
+ cond_resched();
if (signalled())
return -EINTR;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 0a51fd46a84..77e8800d412 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -990,6 +990,7 @@ static void xs_udp_connect_worker(void *args)
sk->sk_data_ready = xs_udp_data_ready;
sk->sk_write_space = xs_udp_write_space;
sk->sk_no_check = UDP_CSUM_NORCV;
+ sk->sk_allocation = GFP_ATOMIC;
xprt_set_connected(xprt);
@@ -1074,6 +1075,7 @@ static void xs_tcp_connect_worker(void *args)
sk->sk_data_ready = xs_tcp_data_ready;
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
+ sk->sk_allocation = GFP_ATOMIC;
/* socket options */
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 0db9e57013f..d19e274b9c4 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -346,6 +346,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
struct xfrm_policy *pol, **p;
struct xfrm_policy *delpol = NULL;
struct xfrm_policy **newpos = NULL;
+ struct dst_entry *gc_list;
write_lock_bh(&xfrm_policy_lock);
for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) {
@@ -381,9 +382,36 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
xfrm_pol_hold(policy);
write_unlock_bh(&xfrm_policy_lock);
- if (delpol) {
+ if (delpol)
xfrm_policy_kill(delpol);
+
+ read_lock_bh(&xfrm_policy_lock);
+ gc_list = NULL;
+ for (policy = policy->next; policy; policy = policy->next) {
+ struct dst_entry *dst;
+
+ write_lock(&policy->lock);
+ dst = policy->bundles;
+ if (dst) {
+ struct dst_entry *tail = dst;
+ while (tail->next)
+ tail = tail->next;
+ tail->next = gc_list;
+ gc_list = dst;
+
+ policy->bundles = NULL;
+ }
+ write_unlock(&policy->lock);
+ }
+ read_unlock_bh(&xfrm_policy_lock);
+
+ while (gc_list) {
+ struct dst_entry *dst = gc_list;
+
+ gc_list = dst->next;
+ dst_free(dst);
}
+
return 0;
}
EXPORT_SYMBOL(xfrm_policy_insert);
@@ -1014,13 +1042,12 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
}
EXPORT_SYMBOL(__xfrm_route_forward);
-/* Optimize later using cookies and generation ids. */
-
static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
- if (!stale_bundle(dst))
- return dst;
-
+ /* If it is marked obsolete, which is how we even get here,
+ * then we have purged it from the policy bundle list and we
+ * did that for a good reason.
+ */
return NULL;
}
@@ -1104,6 +1131,16 @@ int xfrm_flush_bundles(void)
return 0;
}
+static int always_true(struct dst_entry *dst)
+{
+ return 1;
+}
+
+void xfrm_flush_all_bundles(void)
+{
+ xfrm_prune_bundles(always_true);
+}
+
void xfrm_init_pmtu(struct dst_entry *dst)
{
do {
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 7cf48aa6c95..479effc9766 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -431,6 +431,8 @@ void xfrm_state_insert(struct xfrm_state *x)
spin_lock_bh(&xfrm_state_lock);
__xfrm_state_insert(x);
spin_unlock_bh(&xfrm_state_lock);
+
+ xfrm_flush_all_bundles();
}
EXPORT_SYMBOL(xfrm_state_insert);
@@ -478,6 +480,9 @@ out:
spin_unlock_bh(&xfrm_state_lock);
xfrm_state_put_afinfo(afinfo);
+ if (!err)
+ xfrm_flush_all_bundles();
+
if (x1) {
xfrm_state_delete(x1);
xfrm_state_put(x1);