diff options
Diffstat (limited to 'net')
122 files changed, 1728 insertions, 1156 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 91e412b0ab0..67465b65abe 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -753,6 +753,8 @@ static int vlan_ioctl_handler(void __user *arg) break; case GET_VLAN_REALDEV_NAME_CMD: err = vlan_dev_get_realdev_name(args.device1, args.u.device2); + if (err) + goto out; if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) { err = -EFAULT; @@ -761,6 +763,8 @@ static int vlan_ioctl_handler(void __user *arg) case GET_VLAN_VID_CMD: err = vlan_dev_get_vid(args.device1, &vid); + if (err) + goto out; args.u.VID = vid; if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) { @@ -774,7 +778,7 @@ static int vlan_ioctl_handler(void __user *arg) __FUNCTION__, args.cmd); return -EINVAL; }; - +out: return err; } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index b7486488967..f2a8750bbf1 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -165,6 +165,9 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, skb_pull(skb, VLAN_HLEN); /* take off the VLAN header (4 bytes currently) */ + /* Need to correct hardware checksum */ + skb_postpull_rcsum(skb, vhdr, VLAN_HLEN); + /* Ok, lets check to make sure the device (dev) we * came in on is what this VLAN is attached to. */ diff --git a/net/Makefile b/net/Makefile index 4aa2f46d2a5..f5141b9d4f3 100644 --- a/net/Makefile +++ b/net/Makefile @@ -15,8 +15,8 @@ obj-$(CONFIG_NET) += $(tmp-y) # LLC has to be linked before the files in net/802/ obj-$(CONFIG_LLC) += llc/ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ -obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ ifneq ($(CONFIG_IPV6),) diff --git a/net/atm/atm_misc.c b/net/atm/atm_misc.c index 223c7ad5bd0..02cc7e71efe 100644 --- a/net/atm/atm_misc.c +++ b/net/atm/atm_misc.c @@ -74,11 +74,14 @@ struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc,int pdu_size, */ -int atm_pcr_goal(struct atm_trafprm *tp) +int atm_pcr_goal(const struct atm_trafprm *tp) { - if (tp->pcr && tp->pcr != ATM_MAX_PCR) return -tp->pcr; - if (tp->min_pcr && !tp->pcr) return tp->min_pcr; - if (tp->max_pcr != ATM_MAX_PCR) return -tp->max_pcr; + if (tp->pcr && tp->pcr != ATM_MAX_PCR) + return -tp->pcr; + if (tp->min_pcr && !tp->pcr) + return tp->min_pcr; + if (tp->max_pcr != ATM_MAX_PCR) + return -tp->max_pcr; return 0; } diff --git a/net/atm/common.c b/net/atm/common.c index 63feea49fb1..6656b111cc0 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -221,6 +221,29 @@ void vcc_release_async(struct atm_vcc *vcc, int reply) EXPORT_SYMBOL(vcc_release_async); +void atm_dev_release_vccs(struct atm_dev *dev) +{ + int i; + + write_lock_irq(&vcc_sklist_lock); + for (i = 0; i < VCC_HTABLE_SIZE; i++) { + struct hlist_head *head = &vcc_hash[i]; + struct hlist_node *node, *tmp; + struct sock *s; + struct atm_vcc *vcc; + + sk_for_each_safe(s, node, tmp, head) { + vcc = atm_sk(s); + if (vcc->dev == dev) { + vcc_release_async(vcc, -EPIPE); + sk_del_node_init(s); + } + } + } + write_unlock_irq(&vcc_sklist_lock); +} + + static int adjust_tp(struct atm_trafprm *tp,unsigned char aal) { int max_sdu; @@ -332,12 +355,13 @@ static int __vcc_connect(struct atm_vcc *vcc, struct atm_dev *dev, short vpi, return -EINVAL; if (vci > 0 && vci < ATM_NOT_RSV_VCI && !capable(CAP_NET_BIND_SERVICE)) return -EPERM; - error = 0; + error = -ENODEV; if (!try_module_get(dev->ops->owner)) - return -ENODEV; + return error; vcc->dev = dev; write_lock_irq(&vcc_sklist_lock); - if ((error = find_ci(vcc, &vpi, &vci))) { + if (test_bit(ATM_DF_REMOVED, &dev->flags) || + (error = find_ci(vcc, &vpi, &vci))) { write_unlock_irq(&vcc_sklist_lock); goto fail_module_put; } @@ -423,33 +447,23 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci) if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS || vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) return -EINVAL; - if (itf != ATM_ITF_ANY) { - dev = atm_dev_lookup(itf); - if (!dev) - return -ENODEV; - error = __vcc_connect(vcc, dev, vpi, vci); - if (error) { - atm_dev_put(dev); - return error; - } + if (likely(itf != ATM_ITF_ANY)) { + dev = try_then_request_module(atm_dev_lookup(itf), "atm-device-%d", itf); } else { - struct list_head *p, *next; - dev = NULL; - spin_lock(&atm_dev_lock); - list_for_each_safe(p, next, &atm_devs) { - dev = list_entry(p, struct atm_dev, dev_list); + down(&atm_dev_mutex); + if (!list_empty(&atm_devs)) { + dev = list_entry(atm_devs.next, struct atm_dev, dev_list); atm_dev_hold(dev); - spin_unlock(&atm_dev_lock); - if (!__vcc_connect(vcc, dev, vpi, vci)) - break; - atm_dev_put(dev); - dev = NULL; - spin_lock(&atm_dev_lock); } - spin_unlock(&atm_dev_lock); - if (!dev) - return -ENODEV; + up(&atm_dev_mutex); + } + if (!dev) + return -ENODEV; + error = __vcc_connect(vcc, dev, vpi, vci); + if (error) { + atm_dev_put(dev); + return error; } if (vpi == ATM_VPI_UNSPEC || vci == ATM_VCI_UNSPEC) set_bit(ATM_VF_PARTIAL,&vcc->flags); diff --git a/net/atm/common.h b/net/atm/common.h index e49ed41c0e3..4887c317cef 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -47,4 +47,6 @@ static inline void atm_proc_exit(void) /* SVC */ int svc_change_qos(struct atm_vcc *vcc,struct atm_qos *qos); +void atm_dev_release_vccs(struct atm_dev *dev); + #endif diff --git a/net/atm/resources.c b/net/atm/resources.c index 415d2615d47..c8c459fcb03 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -25,7 +25,7 @@ LIST_HEAD(atm_devs); -DEFINE_SPINLOCK(atm_dev_lock); +DECLARE_MUTEX(atm_dev_mutex); static struct atm_dev *__alloc_atm_dev(const char *type) { @@ -52,7 +52,7 @@ static struct atm_dev *__atm_dev_lookup(int number) list_for_each(p, &atm_devs) { dev = list_entry(p, struct atm_dev, dev_list); - if ((dev->ops) && (dev->number == number)) { + if (dev->number == number) { atm_dev_hold(dev); return dev; } @@ -64,12 +64,13 @@ struct atm_dev *atm_dev_lookup(int number) { struct atm_dev *dev; - spin_lock(&atm_dev_lock); + down(&atm_dev_mutex); dev = __atm_dev_lookup(number); - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); return dev; } + struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops, int number, unsigned long *flags) { @@ -81,11 +82,11 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops, type); return NULL; } - spin_lock(&atm_dev_lock); + down(&atm_dev_mutex); if (number != -1) { if ((inuse = __atm_dev_lookup(number))) { atm_dev_put(inuse); - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); kfree(dev); return NULL; } @@ -105,19 +106,17 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops, memset(&dev->flags, 0, sizeof(dev->flags)); memset(&dev->stats, 0, sizeof(dev->stats)); atomic_set(&dev->refcnt, 1); - list_add_tail(&dev->dev_list, &atm_devs); - spin_unlock(&atm_dev_lock); if (atm_proc_dev_register(dev) < 0) { printk(KERN_ERR "atm_dev_register: " "atm_proc_dev_register failed for dev %s\n", type); - spin_lock(&atm_dev_lock); - list_del(&dev->dev_list); - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); kfree(dev); return NULL; } + list_add_tail(&dev->dev_list, &atm_devs); + up(&atm_dev_mutex); return dev; } @@ -125,37 +124,22 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops, void atm_dev_deregister(struct atm_dev *dev) { - unsigned long warning_time; + BUG_ON(test_bit(ATM_DF_REMOVED, &dev->flags)); + set_bit(ATM_DF_REMOVED, &dev->flags); + + /* + * if we remove current device from atm_devs list, new device + * with same number can appear, such we need deregister proc, + * release async all vccs and remove them from vccs list too + */ + down(&atm_dev_mutex); + list_del(&dev->dev_list); + up(&atm_dev_mutex); + atm_dev_release_vccs(dev); atm_proc_dev_deregister(dev); - spin_lock(&atm_dev_lock); - list_del(&dev->dev_list); - spin_unlock(&atm_dev_lock); - - warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 1) { - msleep(250); - if ((jiffies - warning_time) > 10 * HZ) { - printk(KERN_EMERG "atm_dev_deregister: waiting for " - "dev %d to become free. Usage count = %d\n", - dev->number, atomic_read(&dev->refcnt)); - warning_time = jiffies; - } - } - - kfree(dev); -} - -void shutdown_atm_dev(struct atm_dev *dev) -{ - if (atomic_read(&dev->refcnt) > 1) { - set_bit(ATM_DF_CLOSE, &dev->flags); - return; - } - if (dev->ops->dev_close) - dev->ops->dev_close(dev); - atm_dev_deregister(dev); + atm_dev_put(dev); } @@ -211,16 +195,16 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg) return -EFAULT; if (get_user(len, &iobuf->length)) return -EFAULT; - spin_lock(&atm_dev_lock); + down(&atm_dev_mutex); list_for_each(p, &atm_devs) size += sizeof(int); if (size > len) { - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); return -E2BIG; } tmp_buf = kmalloc(size, GFP_ATOMIC); if (!tmp_buf) { - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); return -ENOMEM; } tmp_p = tmp_buf; @@ -228,7 +212,7 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg) dev = list_entry(p, struct atm_dev, dev_list); *tmp_p++ = dev->number; } - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); error = ((copy_to_user(buf, tmp_buf, size)) || put_user(size, &iobuf->length)) ? -EFAULT : 0; @@ -245,7 +229,8 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg) if (get_user(number, &sioc->number)) return -EFAULT; - if (!(dev = atm_dev_lookup(number))) + if (!(dev = try_then_request_module(atm_dev_lookup(number), + "atm-device-%d", number))) return -ENODEV; switch (cmd) { @@ -414,13 +399,13 @@ static __inline__ void *dev_get_idx(loff_t left) void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos) { - spin_lock(&atm_dev_lock); + down(&atm_dev_mutex); return *pos ? dev_get_idx(*pos) : (void *) 1; } void atm_dev_seq_stop(struct seq_file *seq, void *v) { - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); } void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -434,4 +419,3 @@ void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) EXPORT_SYMBOL(atm_dev_register); EXPORT_SYMBOL(atm_dev_deregister); EXPORT_SYMBOL(atm_dev_lookup); -EXPORT_SYMBOL(shutdown_atm_dev); diff --git a/net/atm/resources.h b/net/atm/resources.h index 12910619dbb..b7fb82a93b4 100644 --- a/net/atm/resources.h +++ b/net/atm/resources.h @@ -11,8 +11,7 @@ extern struct list_head atm_devs; -extern spinlock_t atm_dev_lock; - +extern struct semaphore atm_dev_mutex; int atm_dev_ioctl(unsigned int cmd, void __user *arg); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index defcf6a8607..975abe254b7 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -366,6 +366,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) spin_lock_bh(&br->lock); br_stp_recalculate_bridge_id(br); + br_features_recompute(br); if ((br->dev->flags & IFF_UP) && (dev->flags & IFF_UP) && netif_carrier_ok(dev)) br_stp_enable_port(p); diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index d8e36b77512..23422bd53a5 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -295,7 +295,7 @@ static int check_hbh_len(struct sk_buff *skb) len -= 2; while (len > 0) { - int optlen = raw[off+1]+2; + int optlen = skb->nh.raw[off+1]+2; switch (skb->nh.raw[off]) { case IPV6_TLV_PAD0: @@ -308,18 +308,15 @@ static int check_hbh_len(struct sk_buff *skb) case IPV6_TLV_JUMBO: if (skb->nh.raw[off+1] != 4 || (off&3) != 2) goto bad; - pkt_len = ntohl(*(u32*)(skb->nh.raw+off+2)); - + if (pkt_len <= IPV6_MAXPLEN || + skb->nh.ipv6h->payload_len) + goto bad; if (pkt_len > skb->len - sizeof(struct ipv6hdr)) goto bad; - if (pkt_len + sizeof(struct ipv6hdr) < skb->len) { - if (__pskb_trim(skb, - pkt_len + sizeof(struct ipv6hdr))) - goto bad; - if (skb->ip_summed == CHECKSUM_HW) - skb->ip_summed = CHECKSUM_NONE; - } + if (pskb_trim_rcsum(skb, + pkt_len+sizeof(struct ipv6hdr))) + goto bad; break; default: if (optlen > len) @@ -372,6 +369,7 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook, if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) goto inhdr_error; + nf_bridge_put(skb->nf_bridge); if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) return NF_DROP; setup_pre_routing(skb); @@ -455,6 +453,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, skb->ip_summed = CHECKSUM_NONE; } + nf_bridge_put(skb->nf_bridge); if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) return NF_DROP; setup_pre_routing(skb); diff --git a/net/core/datagram.c b/net/core/datagram.c index d219435d086..1bcfef51ac5 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -350,6 +350,20 @@ fault: return -EFAULT; } +unsigned int __skb_checksum_complete(struct sk_buff *skb) +{ + unsigned int sum; + + sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + /** * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. * @skb: skbuff @@ -363,7 +377,7 @@ fault: * -EFAULT - fault during copy. Beware, in this case iovec * can be modified! */ -int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, +int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov) { unsigned int csum; @@ -376,8 +390,7 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, iov++; if (iov->iov_len < chunk) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen, - skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_error; if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) goto fault; @@ -388,6 +401,8 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, goto fault; if ((unsigned short)csum_fold(csum)) goto csum_error; + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); iov->iov_len -= chunk; iov->iov_base += chunk; } diff --git a/net/core/dev.c b/net/core/dev.c index 8d154159527..a5efc9ae010 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1108,6 +1108,19 @@ out: return ret; } +/* Take action when hardware reception checksum errors are detected. */ +#ifdef CONFIG_BUG +void netdev_rx_csum_fault(struct net_device *dev) +{ + if (net_ratelimit()) { + printk(KERN_ERR "%s: hw csum failure.\n", + dev ? dev->name : "<unknown>"); + dump_stack(); + } +} +EXPORT_SYMBOL(netdev_rx_csum_fault); +#endif + #ifdef CONFIG_HIGHMEM /* Actually, we should eliminate this check as soon as we know, that: * 1. IOMMU is present and allows to map all the memory. diff --git a/net/core/filter.c b/net/core/filter.c index 079c2edff78..3a10e0bc90e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -116,8 +116,6 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) A /= X; continue; case BPF_ALU|BPF_DIV|BPF_K: - if (fentry->k == 0) - return 0; A /= fentry->k; continue; case BPF_ALU|BPF_AND|BPF_X: @@ -295,7 +293,7 @@ int sk_chk_filter(struct sock_filter *filter, int flen) struct sock_filter *ftest; int pc; - if (((unsigned int)flen >= (~0U / sizeof(struct sock_filter))) || flen == 0) + if (flen == 0 || flen > BPF_MAXINSNS) return -EINVAL; /* check the filter code now */ @@ -320,6 +318,10 @@ int sk_chk_filter(struct sock_filter *filter, int flen) } } + /* check for division by zero -Kris Katterjohn 2005-10-30 */ + if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0) + return -EINVAL; + /* check that memory operations use valid addresses. */ if (ftest->k >= BPF_MEMWORDS) { /* but it might not be a memory operation... */ @@ -358,7 +360,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) int err; /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL || fprog->len > BPF_MAXINSNS) + if (fprog->filter == NULL) return -EINVAL; fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 802fe11efad..49424a42a2c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -101,16 +101,20 @@ void netpoll_queue(struct sk_buff *skb) static int checksum_udp(struct sk_buff *skb, struct udphdr *uh, unsigned short ulen, u32 saddr, u32 daddr) { - if (uh->check == 0) + unsigned int psum; + + if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY) return 0; - if (skb->ip_summed == CHECKSUM_HW) - return csum_tcpudp_magic( - saddr, daddr, ulen, IPPROTO_UDP, skb->csum); + psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + + if (skb->ip_summed == CHECKSUM_HW && + !(u16)csum_fold(csum_add(psum, skb->csum))) + return 0; - skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + skb->csum = psum; - return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + return __skb_checksum_complete(skb); } /* @@ -489,7 +493,7 @@ int __netpoll_rx(struct sk_buff *skb) if (ulen != len) goto out; - if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0) + if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) goto out; if (np->local_ip && np->local_ip != ntohl(iph->daddr)) goto out; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b7d13a4fff4..83fee37de38 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1725,7 +1725,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, * of the skb if any page alloc fails user this procedure returns -ENOMEM */ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, - int getfrag(void *from, char *to, int offset, + int (*getfrag)(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length) { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ca03521112c..656e13e38cf 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1251,7 +1251,7 @@ static int dccp_v4_destroy_sock(struct sock *sk) struct dccp_sock *dp = dccp_sk(sk); /* - * DCCP doesn't use sk_qrite_queue, just sk_send_head + * DCCP doesn't use sk_write_queue, just sk_send_head * for retransmissions */ if (sk->sk_send_head != NULL) { diff --git a/net/dccp/proto.c b/net/dccp/proto.c index e0ace7cbb99..8a6b2a9e458 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -46,6 +46,7 @@ atomic_t dccp_orphan_count = ATOMIC_INIT(0); static struct net_protocol dccp_protocol = { .handler = dccp_v4_rcv, .err_handler = dccp_v4_err, + .no_policy = 1, }; const char *dccp_packet_name(const int type) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 3f25cadccdd..d402e9020c6 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -153,6 +153,7 @@ static struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; +static atomic_t decnet_memory_allocated; static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen, int flags); static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); @@ -446,10 +447,26 @@ static void dn_destruct(struct sock *sk) dst_release(xchg(&sk->sk_dst_cache, NULL)); } +static int dn_memory_pressure; + +static void dn_enter_memory_pressure(void) +{ + if (!dn_memory_pressure) { + dn_memory_pressure = 1; + } +} + static struct proto dn_proto = { - .name = "DECNET", - .owner = THIS_MODULE, - .obj_size = sizeof(struct dn_sock), + .name = "NSP", + .owner = THIS_MODULE, + .enter_memory_pressure = dn_enter_memory_pressure, + .memory_pressure = &dn_memory_pressure, + .memory_allocated = &decnet_memory_allocated, + .sysctl_mem = sysctl_decnet_mem, + .sysctl_wmem = sysctl_decnet_wmem, + .sysctl_rmem = sysctl_decnet_rmem, + .max_header = DN_MAX_NSP_DATA_HEADER + 64, + .obj_size = sizeof(struct dn_sock), }; static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp) @@ -470,6 +487,8 @@ static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp) sk->sk_family = PF_DECnet; sk->sk_protocol = 0; sk->sk_allocation = gfp; + sk->sk_sndbuf = sysctl_decnet_wmem[1]; + sk->sk_rcvbuf = sysctl_decnet_rmem[1]; /* Initialization of DECnet Session Control Port */ scp = DN_SK(sk); @@ -1664,17 +1683,15 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, goto out; } - rv = dn_check_state(sk, NULL, 0, &timeo, flags); - if (rv) - goto out; - if (sk->sk_shutdown & RCV_SHUTDOWN) { - if (!(flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - rv = -EPIPE; + rv = 0; goto out; } + rv = dn_check_state(sk, NULL, 0, &timeo, flags); + if (rv) + goto out; + if (flags & ~(MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) { rv = -EOPNOTSUPP; goto out; @@ -1928,6 +1945,8 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_shutdown & SEND_SHUTDOWN) { err = -EPIPE; + if (!(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); goto out_err; } diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 02bca49cb50..0e9d2c57116 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -10,6 +10,7 @@ * * Changes: * Steve Whitehouse - C99 changes and default device handling + * Steve Whitehouse - Memory buffer settings, like the tcp ones * */ #include <linux/config.h> @@ -37,6 +38,11 @@ int decnet_dr_count = 3; int decnet_log_martians = 1; int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW; +/* Reasonable defaults, I hope, based on tcp's defaults */ +int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; +int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; +int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; + #ifdef CONFIG_SYSCTL extern int decnet_dst_gc_interval; static int min_decnet_time_wait[] = { 5 }; @@ -428,6 +434,33 @@ static ctl_table dn_table[] = { .extra1 = &min_decnet_no_fc_max_cwnd, .extra2 = &max_decnet_no_fc_max_cwnd }, + { + .ctl_name = NET_DECNET_MEM, + .procname = "decnet_mem", + .data = &sysctl_decnet_mem, + .maxlen = sizeof(sysctl_decnet_mem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = NET_DECNET_RMEM, + .procname = "decnet_rmem", + .data = &sysctl_decnet_rmem, + .maxlen = sizeof(sysctl_decnet_rmem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = NET_DECNET_WMEM, + .procname = "decnet_wmem", + .data = &sysctl_decnet_wmem, + .maxlen = sizeof(sysctl_decnet_wmem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, { .ctl_name = NET_DECNET_DEBUG_LEVEL, .procname = "debug", diff --git a/net/ieee80211/Kconfig b/net/ieee80211/Kconfig index 91b16fbf91f..d18ccba3ea9 100644 --- a/net/ieee80211/Kconfig +++ b/net/ieee80211/Kconfig @@ -55,7 +55,7 @@ config IEEE80211_CRYPT_CCMP config IEEE80211_CRYPT_TKIP tristate "IEEE 802.11i TKIP encryption" - depends on IEEE80211 + depends on IEEE80211 && NET_RADIO select CRYPTO select CRYPTO_MICHAEL_MIC ---help--- diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eaa150c33b0..d368cf24900 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -228,13 +228,14 @@ static int inet_create(struct socket *sock, int protocol) unsigned char answer_flags; char answer_no_check; int try_loading_module = 0; - int err = -ESOCKTNOSUPPORT; + int err; sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ answer = NULL; lookup_protocol: + err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_rcu(p, &inetsw[sock->type]) { answer = list_entry(p, struct inet_protosw, list); @@ -252,6 +253,7 @@ lookup_protocol: if (IPPROTO_IP == answer->protocol) break; } + err = -EPROTONOSUPPORT; answer = NULL; } @@ -280,9 +282,6 @@ lookup_protocol: err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; - err = -EPROTONOSUPPORT; - if (!protocol) - goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 4ec4b2ca6ab..04a6fe3e95a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -234,7 +234,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) { struct in_ifaddr *promote = NULL; - struct in_ifaddr *ifa1 = *ifap; + struct in_ifaddr *ifa, *ifa1 = *ifap; + struct in_ifaddr *last_prim = in_dev->ifa_list; + struct in_ifaddr *prev_prom = NULL; + int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); @@ -243,18 +246,22 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { - struct in_ifaddr *ifa; struct in_ifaddr **ifap1 = &ifa1->ifa_next; while ((ifa = *ifap1) != NULL) { + if (!(ifa->ifa_flags & IFA_F_SECONDARY) && + ifa1->ifa_scope <= ifa->ifa_scope) + last_prim = ifa; + if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { ifap1 = &ifa->ifa_next; + prev_prom = ifa; continue; } - if (!IN_DEV_PROMOTE_SECONDARIES(in_dev)) { + if (!do_promote) { *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa); @@ -283,18 +290,31 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, */ rtmsg_ifa(RTM_DELADDR, ifa1); notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); - if (destroy) { - inet_free_ifa(ifa1); - if (!in_dev->ifa_list) - inetdev_destroy(in_dev); - } + if (promote) { + + if (prev_prom) { + prev_prom->ifa_next = promote->ifa_next; + promote->ifa_next = last_prim->ifa_next; + last_prim->ifa_next = promote; + } - if (promote && IN_DEV_PROMOTE_SECONDARIES(in_dev)) { - /* not sure if we should send a delete notify first? */ promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote); notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); + for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) { + if (ifa1->ifa_mask != ifa->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, ifa)) + continue; + fib_add_ifaddr(ifa); + } + + } + if (destroy) { + inet_free_ifa(ifa1); + + if (!in_dev->ifa_list) + inetdev_destroy(in_dev); } } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 2267c1fad87..19b1b984d68 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -407,7 +407,7 @@ static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); } -static void fib_add_ifaddr(struct in_ifaddr *ifa) +void fib_add_ifaddr(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; @@ -544,12 +544,16 @@ static void nl_fib_input(struct sock *sk, int len) struct sk_buff *skb = NULL; struct nlmsghdr *nlh = NULL; struct fib_result_nl *frn; - int err; u32 pid; struct fib_table *tb; - skb = skb_recv_datagram(sk, 0, 0, &err); + skb = skb_dequeue(&sk->sk_receive_queue); nlh = (struct nlmsghdr *)skb->data; + if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || + nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) { + kfree_skb(skb); + return; + } frn = (struct fib_result_nl *) NLMSG_DATA(nlh); tb = fib_get_table(frn->tb_id_in); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 2a8c9afc369..7ea0209cb16 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -975,7 +975,7 @@ static void fib_seq_stop(struct seq_file *seq, void *v) static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi) { - static unsigned type2flags[RTN_MAX + 1] = { + static const unsigned type2flags[RTN_MAX + 1] = { [7] = RTF_REJECT, [8] = RTF_REJECT, }; unsigned flags = type2flags[type]; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 186f20c4a45..6d2a6ac070e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -83,7 +83,7 @@ for (nhsel=0; nhsel < 1; nhsel++) #define endfor_nexthops(fi) } -static struct +static const struct { int error; u8 scope; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 66247f38b37..705e3ce86df 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2378,6 +2378,7 @@ static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi) */ static int fib_route_seq_show(struct seq_file *seq, void *v) { + const struct fib_trie_iter *iter = seq->private; struct leaf *l = v; int i; char bf[128]; @@ -2389,6 +2390,8 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } + if (iter->trie == trie_local) + return 0; if (IS_TNODE(l)) return 0; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 175e093ec56..92e23b2ad4d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -220,7 +220,7 @@ struct icmp_control { short error; /* This ICMP is classed as an error message */ }; -static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; +static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; /* * The ICMP socket(s). This is the most convenient way to flow control @@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb) case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) + skb->csum = 0; + if (__skb_checksum_complete(skb)) goto error; - default:; } if (!pskb_pull(skb, sizeof(struct icmphdr))) @@ -994,7 +994,7 @@ error: /* * This table is the definition of how we handle ICMP. */ -static struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { +static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = { .output_entry = ICMP_MIB_OUTECHOREPS, .input_entry = ICMP_MIB_INECHOREPS, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c6247fc8406..4a195c724f0 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb) return 0; } - if (!pskb_may_pull(skb, sizeof(struct igmphdr)) || - (u16)csum_fold(skb_checksum(skb, 0, len, 0))) { - in_dev_put(in_dev); - kfree_skb(skb); - return 0; + if (!pskb_may_pull(skb, sizeof(struct igmphdr))) + goto drop; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + if (__skb_checksum_complete(skb)) + goto drop; } ih = skb->h.igmph; @@ -890,7 +897,10 @@ int igmp_rcv(struct sk_buff *skb) /* Is it our report looped back? */ if (((struct rtable*)skb->dst)->fl.iif == 0) break; - igmp_heard_report(in_dev, ih->group); + /* don't rely on MC router hearing unicast reports */ + if (skb->pkt_type == PACKET_MULTICAST || + skb->pkt_type == PACKET_BROADCAST) + igmp_heard_report(in_dev, ih->group); break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1 @@ -906,6 +916,8 @@ int igmp_rcv(struct sk_buff *skb) default: NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); } + +drop: in_dev_put(in_dev); kfree_skb(skb); return 0; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e7d26d9943c..8ce0ce2ee48 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -71,7 +71,7 @@ struct ipfrag_skb_cb /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { - struct ipq *next; /* linked list pointers */ + struct hlist_node list; struct list_head lru_list; /* lru list member */ u32 user; u32 saddr; @@ -89,7 +89,6 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - struct ipq **pprev; int iif; struct timeval stamp; }; @@ -99,7 +98,7 @@ struct ipq { #define IPQ_HASHSZ 64 /* Per-bucket lock is easy to add now. */ -static struct ipq *ipq_hash[IPQ_HASHSZ]; +static struct hlist_head ipq_hash[IPQ_HASHSZ]; static DEFINE_RWLOCK(ipfrag_lock); static u32 ipfrag_hash_rnd; static LIST_HEAD(ipq_lru_list); @@ -107,9 +106,7 @@ int ip_frag_nqueues = 0; static __inline__ void __ipq_unlink(struct ipq *qp) { - if(qp->next) - qp->next->pprev = qp->pprev; - *qp->pprev = qp->next; + hlist_del(&qp->list); list_del(&qp->lru_list); ip_frag_nqueues--; } @@ -139,27 +136,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy) get_random_bytes(&ipfrag_hash_rnd, sizeof(u32)); for (i = 0; i < IPQ_HASHSZ; i++) { struct ipq *q; + struct hlist_node *p, *n; - q = ipq_hash[i]; - while (q) { - struct ipq *next = q->next; + hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) { unsigned int hval = ipqhashfn(q->id, q->saddr, q->daddr, q->protocol); if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; + hlist_del(&q->list); /* Relink to new hash chain. */ - if ((q->next = ipq_hash[hval]) != NULL) - q->next->pprev = &q->next; - ipq_hash[hval] = q; - q->pprev = &ipq_hash[hval]; + hlist_add_head(&q->list, &ipq_hash[hval]); } - - q = next; } } write_unlock(&ipfrag_lock); @@ -310,14 +298,16 @@ out: static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) { struct ipq *qp; - +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif write_lock(&ipfrag_lock); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could be created on other cpu, while we * promoted read lock to write lock. */ - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == qp_in->id && qp->saddr == qp_in->saddr && qp->daddr == qp_in->daddr && @@ -337,10 +327,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); - if((qp->next = ipq_hash[hash]) != NULL) - qp->next->pprev = &qp->next; - ipq_hash[hash] = qp; - qp->pprev = &ipq_hash[hash]; + hlist_add_head(&qp->list, &ipq_hash[hash]); INIT_LIST_HEAD(&qp->lru_list); list_add_tail(&qp->lru_list, &ipq_lru_list); ip_frag_nqueues++; @@ -392,9 +379,10 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) __u8 protocol = iph->protocol; unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); struct ipq *qp; + struct hlist_node *n; read_lock(&ipfrag_lock); - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == id && qp->saddr == saddr && qp->daddr == daddr && diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 896ce3f8f53..46f9d9cf7a5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb) goto drop_nolock; if (flags&GRE_CSUM) { - if (skb->ip_summed == CHECKSUM_HW) { + switch (skb->ip_summed) { + case CHECKSUM_HW: csum = (u16)csum_fold(skb->csum); - if (csum) - skb->ip_summed = CHECKSUM_NONE; - } - if (skb->ip_summed == CHECKSUM_NONE) { - skb->csum = skb_checksum(skb, 0, skb->len, 0); + if (!csum) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + csum = __skb_checksum_complete(skb); skb->ip_summed = CHECKSUM_HW; - csum = (u16)csum_fold(skb->csum); } offset += 4; } @@ -617,7 +618,7 @@ static int ipgre_rcv(struct sk_buff *skb) skb->mac.raw = skb->nh.raw; skb->nh.raw = __pskb_pull(skb, offset); - skb_postpull_rcsum(skb, skb->mac.raw, offset); + skb_postpull_rcsum(skb, skb->h.raw, offset); memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); skb->pkt_type = PACKET_HOST; #ifdef CONFIG_NET_IPGRE_BROADCAST @@ -1216,7 +1217,7 @@ static int ipgre_tunnel_init(struct net_device *dev) return 0; } -int __init ipgre_fb_tunnel_init(struct net_device *dev) +static int __init ipgre_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; struct iphdr *iph = &tunnel->parms.iph; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 11c2f68254f..eba64e2bd39 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -690,7 +690,7 @@ csum_page(struct page *page, int offset, int copy) return csum; } -inline int ip_ufo_append_data(struct sock *sk, +static inline int ip_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index f828fa2eb7d..2a3a8c59c65 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -771,7 +771,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp) * The drop rate array needs tuning for real environments. * Called from timer bh only => no locking */ - static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; static char todrop_counter[9] = {0}; int i; diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 2d66848e7aa..9bdcf31b760 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1909,7 +1909,7 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) #define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) #define MAX_ARG_LEN SVCDEST_ARG_LEN -static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { +static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, @@ -2180,7 +2180,7 @@ __ip_vs_get_timeouts(struct ip_vs_timeout_user *u) #define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) #define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) -static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { +static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index c19408973c0..0e878fd6215 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -251,7 +251,7 @@ tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) #define TCP_DIR_OUTPUT 4 #define TCP_DIR_INPUT_ONLY 8 -static int tcp_state_off[IP_VS_DIR_LAST] = { +static const int tcp_state_off[IP_VS_DIR_LAST] = { [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 9d3c8b5f327..88a60650e6b 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -56,8 +56,8 @@ config IP_NF_CONNTRACK_MARK instead of the individual packets. config IP_NF_CONNTRACK_EVENTS - bool "Connection tracking events" - depends on IP_NF_CONNTRACK + bool "Connection tracking events (EXPERIMENTAL)" + depends on EXPERIMENTAL && IP_NF_CONNTRACK help If this option is enabled, the connection tracking code will provide a notifier chain that can be used by other kernel code @@ -66,8 +66,8 @@ config IP_NF_CONNTRACK_EVENTS IF unsure, say `N'. config IP_NF_CONNTRACK_NETLINK - tristate 'Connection tracking netlink interface' - depends on IP_NF_CONNTRACK && NETFILTER_NETLINK + tristate 'Connection tracking netlink interface (EXPERIMENTAL)' + depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m help This option enables support for a netlink-based userspace interface @@ -440,7 +440,7 @@ config IP_NF_MATCH_COMMENT config IP_NF_MATCH_CONNMARK tristate 'Connection mark match support' depends on IP_NF_IPTABLES - depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) + depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `connmark' match, which allows you to match the connection mark value previously set for the session by `CONNMARK'. @@ -452,7 +452,7 @@ config IP_NF_MATCH_CONNMARK config IP_NF_MATCH_CONNBYTES tristate 'Connection byte/packet counter match support' depends on IP_NF_IPTABLES - depends on IP_NF_CT_ACCT || (NF_CT_ACCT && NF_CONNTRACK_IPV4) + depends on (IP_NF_CONNTRACK && IP_NF_CT_ACCT) || (NF_CT_ACCT && NF_CONNTRACK_IPV4) help This option adds a `connbytes' match, which allows you to match the number of bytes and/or packets for each direction within a connection. @@ -767,7 +767,7 @@ config IP_NF_TARGET_TTL config IP_NF_TARGET_CONNMARK tristate 'CONNMARK target support' depends on IP_NF_MANGLE - depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) + depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `CONNMARK' target, which allows one to manipulate the connection mark value. Similar to the MARK target, but @@ -779,8 +779,8 @@ config IP_NF_TARGET_CONNMARK config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" - depends on IP_NF_IPTABLES && EXPERIMENTAL - depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) + depends on IP_NF_MANGLE && EXPERIMENTAL + depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 058c48e258f..d0a447e520a 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -12,6 +12,7 @@ ip_nat_pptp-objs := ip_nat_helper_pptp.o ip_nat_proto_gre.o # connection tracking obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o +obj-$(CONFIG_IP_NF_NAT) += ip_nat.o # conntrack netlink interface obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o @@ -41,7 +42,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o # the three instances of ip_tables obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o -obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o ip_nat.o +obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o # matches diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index fa3f914117e..e52847fa10f 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -37,7 +37,7 @@ MODULE_LICENSE("GPL"); module_param(master_timeout, int, 0600); MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); -static char *conns[] = { "DATA ", "MESG ", "INDEX " }; +static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; /* This is slow, but it's simple. --RR */ static char *amanda_buffer; diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 422ab68ee7f..84c66dbfeda 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1345,6 +1345,11 @@ static int kill_all(struct ip_conntrack *i, void *data) return 1; } +void ip_conntrack_flush(void) +{ + ip_ct_iterate_cleanup(kill_all, NULL); +} + static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) { if (vmalloced) @@ -1354,8 +1359,12 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) get_order(sizeof(struct list_head) * size)); } -void ip_conntrack_flush() +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) { + ip_ct_attach = NULL; + /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module delete... */ @@ -1363,7 +1372,7 @@ void ip_conntrack_flush() ip_ct_event_cache_flush(); i_see_dead_people: - ip_ct_iterate_cleanup(kill_all, NULL); + ip_conntrack_flush(); if (atomic_read(&ip_conntrack_count) != 0) { schedule(); goto i_see_dead_people; @@ -1371,14 +1380,7 @@ void ip_conntrack_flush() /* wait until all references to ip_conntrack_untracked are dropped */ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) schedule(); -} -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void ip_conntrack_cleanup(void) -{ - ip_ct_attach = NULL; - ip_conntrack_flush(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, @@ -1408,7 +1410,7 @@ static struct list_head *alloc_hashtable(int size, int *vmalloced) return hash; } -int set_hashsize(const char *val, struct kernel_param *kp) +static int set_hashsize(const char *val, struct kernel_param *kp) { int i, bucket, hashsize, vmalloced; int old_vmalloced, old_size; diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index d77d6b3f5f8..68b173bcda6 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -29,9 +29,9 @@ static char *ftp_buffer; static DEFINE_SPINLOCK(ip_ftp_lock); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); static int loose; module_param(loose, int, 0600); @@ -55,7 +55,7 @@ static int try_rfc959(const char *, size_t, u_int32_t [], char); static int try_eprt(const char *, size_t, u_int32_t [], char); static int try_epsv_response(const char *, size_t, u_int32_t [], char); -static struct ftp_search { +static const struct ftp_search { enum ip_conntrack_dir dir; const char *pattern; size_t plen; diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 15457415a4f..d7c40421d0d 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -34,7 +34,7 @@ #include <linux/moduleparam.h> #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; static int max_dcc_channels = 8; static unsigned int dcc_timeout = 300; @@ -52,14 +52,14 @@ EXPORT_SYMBOL_GPL(ip_nat_irc_hook); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); MODULE_LICENSE("GPL"); -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of IRC servers"); module_param(max_dcc_channels, int, 0400); MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); module_param(dcc_timeout, int, 0400); MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels"); -static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; +static const char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; #define MINMATCHLEN 5 #if 0 diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index d2a4fec2286..91fe8f2e38f 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -27,6 +27,7 @@ #include <linux/errno.h> #include <linux/netlink.h> #include <linux/spinlock.h> +#include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/netfilter.h> @@ -59,11 +60,13 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb, NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); + /* If no protocol helper is found, this function will return the + * generic protocol helper, so proto won't *ever* be NULL */ proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - if (likely(proto && proto->tuple_to_nfattr)) { + if (likely(proto->tuple_to_nfattr)) ret = proto->tuple_to_nfattr(skb, tuple); - ip_conntrack_proto_put(proto); - } + + ip_conntrack_proto_put(proto); return ret; @@ -128,9 +131,11 @@ ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct) struct nfattr *nest_proto; int ret; - - if (!proto || !proto->to_nfattr) + + if (!proto->to_nfattr) { + ip_conntrack_proto_put(proto); return 0; + } nest_proto = NFA_NEST(skb, CTA_PROTOINFO); @@ -467,7 +472,7 @@ out: } #endif -static const int cta_min_ip[CTA_IP_MAX] = { +static const size_t cta_min_ip[CTA_IP_MAX] = { [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), [CTA_IP_V4_DST-1] = sizeof(u_int32_t), }; @@ -497,8 +502,8 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) return 0; } -static const int cta_min_proto[CTA_PROTO_MAX] = { - [CTA_PROTO_NUM-1] = sizeof(u_int16_t), +static const size_t cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_NUM-1] = sizeof(u_int8_t), [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), @@ -523,14 +528,14 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, if (!tb[CTA_PROTO_NUM-1]) return -EINVAL; - tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - if (likely(proto && proto->nfattr_to_tuple)) { + if (likely(proto->nfattr_to_tuple)) ret = proto->nfattr_to_tuple(tb, tuple); - ip_conntrack_proto_put(proto); - } + + ip_conntrack_proto_put(proto); return ret; } @@ -576,7 +581,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, } #ifdef CONFIG_IP_NF_NAT_NEEDED -static const int cta_min_protonat[CTA_PROTONAT_MAX] = { +static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = { [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), }; @@ -596,8 +601,6 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, return -EINVAL; npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); - if (!npt) - return 0; if (!npt->nfattr_to_range) { ip_nat_proto_put(npt); @@ -614,6 +617,11 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, return 0; } +static const size_t cta_min_nat[CTA_NAT_MAX] = { + [CTA_NAT_MINIP-1] = sizeof(u_int32_t), + [CTA_NAT_MAXIP-1] = sizeof(u_int32_t), +}; + static inline int ctnetlink_parse_nat(struct nfattr *cda[], const struct ip_conntrack *ct, struct ip_nat_range *range) @@ -627,6 +635,9 @@ ctnetlink_parse_nat(struct nfattr *cda[], nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) + return -EINVAL; + if (tb[CTA_NAT_MINIP-1]) range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); @@ -667,6 +678,14 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) return 0; } +static const size_t cta_min[CTA_MAX] = { + [CTA_STATUS-1] = sizeof(u_int32_t), + [CTA_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_MARK-1] = sizeof(u_int32_t), + [CTA_USE-1] = sizeof(u_int32_t), + [CTA_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -678,6 +697,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -706,11 +728,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, return -ENOENT; } } - if (del_timer(&ct->timeout)) { - ip_conntrack_put(ct); + if (del_timer(&ct->timeout)) ct->timeout.function((unsigned long)ct); - return 0; - } + ip_conntrack_put(ct); DEBUGP("leaving\n"); @@ -760,6 +780,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, return 0; } + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -852,7 +875,7 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) DEBUGP("NAT status: %lu\n", status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - if (ip_nat_initialized(ct, hooknum)) + if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) return -EEXIST; ip_nat_setup_info(ct, &range, hooknum); @@ -935,8 +958,6 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); proto = ip_conntrack_proto_find_get(npt); - if (!proto) - return -EINVAL; if (proto->from_nfattr) err = proto->from_nfattr(tb, ct); @@ -1047,6 +1068,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) { err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); if (err < 0) @@ -1252,6 +1276,11 @@ out: return skb->len; } +static const size_t cta_min_exp[CTA_EXPECT_MAX] = { + [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_EXPECT_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -1263,6 +1292,9 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct nfgenmsg *msg = NLMSG_DATA(nlh); u32 rlen; @@ -1333,6 +1365,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct ip_conntrack_helper *h; int err; + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (cda[CTA_EXPECT_TUPLE-1]) { /* delete a single expect by tuple */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); @@ -1462,6 +1497,9 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (!cda[CTA_EXPECT_TUPLE-1] || !cda[CTA_EXPECT_MASK-1] || !cda[CTA_EXPECT_MASTER-1]) @@ -1504,29 +1542,22 @@ static struct notifier_block ctnl_notifier_exp = { static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, }; static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, }; static struct nfnetlink_subsystem ctnl_subsys = { diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 5198f3a1e2c..5f9925db608 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -13,6 +13,7 @@ #include <linux/in.h> #include <linux/icmp.h> #include <linux/seq_file.h> +#include <linux/skbuff.h> #include <net/ip.h> #include <net/checksum.h> #include <linux/netfilter.h> @@ -50,7 +51,7 @@ static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple, const struct ip_conntrack_tuple *orig) { /* Add 1; spaces filled with 0. */ - static u_int8_t invmap[] + static const u_int8_t invmap[] = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, [ICMP_ECHOREPLY] = ICMP_ECHO + 1, [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, @@ -109,7 +110,7 @@ static int icmp_packet(struct ip_conntrack *ct, return NF_ACCEPT; } -static u_int8_t valid_new[] = { +static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, [ICMP_TIMESTAMP] = 1, [ICMP_INFO_REQUEST] = 1, @@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: bad HW ICMP checksum "); - return -NF_ACCEPT; + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + skb->csum = 0; + if (__skb_checksum_complete(skb)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad ICMP checksum "); return -NF_ACCEPT; } - default: - break; } checksum_skipped: diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 59a4a0111dd..977fb59d456 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -65,7 +65,7 @@ static unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; static unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; -static unsigned long * sctp_timeouts[] +static const unsigned long * sctp_timeouts[] = { NULL, /* SCTP_CONNTRACK_NONE */ &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ @@ -118,7 +118,7 @@ cookie echoed to closed. */ /* SCTP conntrack state transitions */ -static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { +static const enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 468c6003b4c..e7fa29e576d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -99,7 +99,7 @@ unsigned long ip_ct_tcp_timeout_close = 10 SECS; to ~13-30min depending on RTO. */ unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS; -static unsigned long * tcp_timeouts[] +static const unsigned long * tcp_timeouts[] = { NULL, /* TCP_CONNTRACK_NONE */ &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ @@ -170,7 +170,7 @@ enum tcp_bit_set { * if they are invalid * or we do not support the request (simultaneous open) */ -static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { +static const enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ @@ -272,9 +272,9 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* - * sSS -> sIV Might be a half-open connection. + * sSS -> sIG Might be a half-open connection. * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -341,9 +341,10 @@ static int tcp_print_conntrack(struct seq_file *s, static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, const struct ip_conntrack *ct) { - struct nfattr *nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); + struct nfattr *nest_parms; read_lock_bh(&tcp_lock); + nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), &ct->proto.tcp.state); read_unlock_bh(&tcp_lock); @@ -357,6 +358,10 @@ nfattr_failure: return -1; } +static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { + [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), +}; + static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) { struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; @@ -369,6 +374,9 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); + if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp)) + return -EINVAL; + if (!tb[CTA_PROTOINFO_TCP_STATE-1]) return -EINVAL; @@ -810,10 +818,11 @@ void ip_conntrack_tcp_update(struct sk_buff *skb, #define TH_CWR 0x80 /* table of valid flag combinations - ECE and CWR are always valid */ -static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = +static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_ACK] = 1, + [TH_SYN|TH_PUSH] = 1, [TH_SYN|TH_ACK|TH_PUSH] = 1, [TH_RST] = 1, [TH_RST|TH_ACK] = 1, @@ -909,8 +918,12 @@ static int tcp_packet(struct ip_conntrack *conntrack, switch (new_state) { case TCP_CONNTRACK_IGNORE: - /* Either SYN in ORIGINAL - * or SYN/ACK in REPLY. */ + /* Ignored packets: + * + * a) SYN in ORIGINAL + * b) SYN/ACK in REPLY + * c) ACK in reply direction after initial SYN in original. + */ if (index == TCP_SYNACK_SET && conntrack->proto.tcp.last_index == TCP_SYN_SET && conntrack->proto.tcp.last_dir != dir @@ -977,13 +990,20 @@ static int tcp_packet(struct ip_conntrack *conntrack, } case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET - && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_SYN_SET + && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_SYN_SET) + || (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { - /* RST sent to invalid SYN we had let trough - * SYN was in window then, tear down connection. + /* RST sent to invalid SYN or ACK we had let trough + * at a) and c) above: + * + * a) SYN was in window then + * c) we hold a half-open connection. + * + * Delete our connection entry. * We skip window checking, because packet might ACK - * segments we ignored in the SYN. */ + * segments we ignored. */ goto in_window; } /* Just fall trough */ diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c index a78736b8525..d3c5a371f99 100644 --- a/net/ipv4/netfilter/ip_conntrack_tftp.c +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -26,9 +26,9 @@ MODULE_DESCRIPTION("tftp connection tracking helper"); MODULE_LICENSE("GPL"); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of tftp servers"); #if 0 diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 762f4d93936..c1a61462507 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -49,7 +49,7 @@ static unsigned int ip_nat_htable_size; static struct list_head *bysource; #define MAX_IP_NAT_PROTO 256 -struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; +static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; static inline struct ip_nat_protocol * __ip_nat_proto_find(u_int8_t protonum) diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c index 2215317c76b..43c3bd7c118 100644 --- a/net/ipv4/netfilter/ip_nat_tftp.c +++ b/net/ipv4/netfilter/ip_nat_tftp.c @@ -42,7 +42,10 @@ static unsigned int help(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, struct ip_conntrack_expect *exp) { - exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; + struct ip_conntrack *ct = exp->master; + + exp->saved_proto.udp.port + = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; exp->dir = IP_CT_DIR_REPLY; exp->expectfn = ip_nat_follow_master; if (ip_conntrack_expect_related(exp) != 0) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 75c27e92f6a..45886c8475e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1892,7 +1892,7 @@ static int ipt_get_matches(char *buffer, char **start, off_t offset, int length) return pos; } -static struct { char *name; get_info_t *get_info; } ipt_proc_entry[] = +static const struct { char *name; get_info_t *get_info; } ipt_proc_entry[] = { { "ip_tables_names", ipt_get_tables }, { "ip_tables_targets", ipt_get_targets }, { "ip_tables_matches", ipt_get_matches }, diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 92ed050fac6..30be0f1dae3 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -197,7 +197,7 @@ static void dump_packet(const struct nf_loginfo *info, } case IPPROTO_ICMP: { struct icmphdr _icmph, *ich; - static size_t required_len[NR_ICMP_TYPES+1] + static const size_t required_len[NR_ICMP_TYPES+1] = { [ICMP_ECHOREPLY] = 4, [ICMP_DEST_UNREACH] = 8 + sizeof(struct iphdr), @@ -351,7 +351,7 @@ static void dump_packet(const struct nf_loginfo *info, /* maxlen = 230+ 91 + 230 + 252 = 803 */ } -struct nf_loginfo default_loginfo = { +static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 2d44b07688a..261cbb4d4c4 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -532,6 +532,7 @@ match(const struct sk_buff *skb, } if(info->seconds && info->hit_count) { for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(r_list[location].last_pkts[pkt_count] == 0) break; if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; } if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index a65e508fbd4..0d7dc668db4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -98,7 +98,7 @@ fold_field(void *mib[], int offt) } /* snmp items */ -static struct snmp_mib snmp4_ipstats_list[] = { +static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES), SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS), @@ -119,7 +119,7 @@ static struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_icmp_list[] = { +static const struct snmp_mib snmp4_icmp_list[] = { SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS), SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS), SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS), @@ -149,7 +149,7 @@ static struct snmp_mib snmp4_icmp_list[] = { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_tcp_list[] = { +static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX), @@ -167,7 +167,7 @@ static struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_udp_list[] = { +static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS), @@ -175,7 +175,7 @@ static struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_net_list[] = { +static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT), SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV), SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 381dd6a6aeb..f701a136a6a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1371,7 +1371,7 @@ out: kfree_skb(skb); * are needed for AMPRnet AX.25 paths. */ -static unsigned short mtu_plateau[] = +static const unsigned short mtu_plateau[] = {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; static __inline__ unsigned short guess_mtu(unsigned short old_mtu) @@ -3149,8 +3149,7 @@ int __init ip_rt_init(void) sizeof(struct rt_hash_bucket), rhash_entries, (num_physpages >= 128 * 1024) ? - (27 - PAGE_SHIFT) : - (29 - PAGE_SHIFT), + 15 : 17, HASH_HIGHMEM, &rt_hash_log, &rt_hash_mask, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 65268562351..01444a02b48 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -645,6 +645,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_tcp_congestion_control, .strategy = &sysctl_tcp_congestion_control, }, + { + .ctl_name = NET_TCP_ABC, + .procname = "tcp_abc", + .data = &sysctl_tcp_abc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 72b7c22e1ea..ef98b14ac56 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1413,7 +1413,7 @@ recv_urg: * closed. */ -static unsigned char new_state[16] = { +static const unsigned char new_state[16] = { /* current state: new state: action: */ /* (Invalid) */ TCP_CLOSE, /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, @@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags) } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { - /* The last check adjusts for discrepance of Linux wrt. RFC + /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ tcp_send_active_reset(sk, gfp_any()); @@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->packets_out = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); @@ -2064,8 +2065,7 @@ void __init tcp_init(void) sizeof(struct inet_ehash_bucket), thash_entries, (num_physpages >= 128 * 1024) ? - (25 - PAGE_SHIFT) : - (27 - PAGE_SHIFT), + 13 : 15, HASH_HIGHMEM, &tcp_hashinfo.ehash_size, NULL, @@ -2081,8 +2081,7 @@ void __init tcp_init(void) sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_size, (num_physpages >= 128 * 1024) ? - (25 - PAGE_SHIFT) : - (27 - PAGE_SHIFT), + 13 : 15, HASH_HIGHMEM, &tcp_hashinfo.bhash_size, NULL, diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ae35e060904..1d0cd86621b 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, bictcp_low_utilization(sk, data_acked); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { bictcp_update(ca, tp->snd_cwnd); - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= ca->cnt) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bbf2d6624e8..c7cc62c8dc1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; - } + /* In "safe" area, increase. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + /* In dangerous area, increase slowly. */ + else if (sysctl_tcp_abc) { + /* RFC3465: Apppriate Byte Count + * increase once for each full cwnd acked + */ + if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { + tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { + /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 6acc04bde08..63cf7e54084 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk) } static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, - u32 in_flight, int good) + u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { /* Update AIMD parameters */ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index e47b37984e9..3284cfb993e 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + measure_rtt(sk); /* keep track of number of round-trip times since last backoff event */ @@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, htcp_alpha_update(ca); } - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd */ if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 77add63623d..40dbb387751 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ca->minrtt = tp->srtt; } + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + if (!ca->hybla_en) return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); - if (in_flight < tp->snd_cwnd) - return; - if (ca->rho == 0) hybla_recalc_param(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e98b57578d..bf2e23086bc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -42,7 +42,7 @@ * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. - * Andrey Savochkin: Fix RTT measurements in the presnce of + * Andrey Savochkin: Fix RTT measurements in the presence of * timestamps. * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming @@ -89,6 +89,7 @@ int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf = 1; +int sysctl_tcp_abc = 1; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk) * of receiver window. Check #2. * * The scheme does not work when sender sends good segments opening - * window and then starts to feed us spagetti. But it should work + * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ @@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, { /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; - int window = tcp_full_space(sk)/2; + int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); /* Try to select rcvbuf so that 4 mss-sized segments - * will fit to window and correspoding skbs will fit to our rcvbuf. + * will fit to window and corresponding skbs will fit to our rcvbuf. * (was 3; 4 is minimum to allow fast retransmit to work.) */ while (tcp_win_from_space(rcvmem) < tp->advmss) @@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); } -/* 4. Try to fixup all. It is made iimediately after connection enters +/* 4. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) @@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk) static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - unsigned int app_win = tp->rcv_nxt - tp->copied_seq; - int ofo_win = 0; icsk->icsk_ack.quick = 0; - skb_queue_walk(&tp->out_of_order_queue, skb) { - ofo_win += skb->len; - } - - /* If overcommit is due to out of order segments, - * do not clamp window. Try to expand rcvbuf instead. - */ - if (ofo_win) { - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); } - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - app_win += ofo_win; - if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) - app_win >>= 1; - if (app_win > icsk->icsk_ack.rcv_mss) - app_win -= icsk->icsk_ack.rcv_mss; - app_win = max(app_win, 2U*tp->advmss); - + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); - } } /* Receiver "autotuning" code. @@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the - * non-timestamp case, we do not smoothe things out - * else with timestamps disabled convergance takes too + * non-timestamp case, we do not smooth things out + * else with timestamps disabled convergence takes too * long. */ if (!win_dep) { @@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) } else if (m < new_sample) new_sample = m << 3; } else { - /* No previous mesaure. */ + /* No previous measure. */ new_sample = m << 3; } @@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { - /* Too long gap. Apparently sender falled to + /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); @@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) +static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) { struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's @@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase - * too slowly, when it should be incresed fastly, decrease too fastly + * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) @@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); tp->rtt_seq = tp->snd_nxt; } - - if (icsk->icsk_ca_ops->rtt_sample) - icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk) * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic - * ACKs in some curcumstances. + * ACKs in some circumstances. */ inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced - * with correct one. It is exaclty, which we pretend to do. + * with correct one. It is exactly, which we pretend to do. */ } @@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk) * to make it more realistic. * * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal curcumstances sending small + * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. * The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever @@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int prior_fackets; u32 lost_retrans = 0; int flag = 0; + int dup_sack = 0; int i; if (!tp->sacked_out) tp->fackets_out = 0; prior_fackets = tp->fackets_out; - for (i=0; i<num_sacks; i++, sp++) { - struct sk_buff *skb; - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count = 0; - int dup_sack = 0; + /* SACK fastpath: + * if the only SACK change is the increase of the end_seq of + * the first block then only apply that SACK block + * and use retrans queue hinting otherwise slowpath */ + flag = 1; + for (i = 0; i< num_sacks; i++) { + __u32 start_seq = ntohl(sp[i].start_seq); + __u32 end_seq = ntohl(sp[i].end_seq); + + if (i == 0){ + if (tp->recv_sack_cache[i].start_seq != start_seq) + flag = 0; + } else { + if ((tp->recv_sack_cache[i].start_seq != start_seq) || + (tp->recv_sack_cache[i].end_seq != end_seq)) + flag = 0; + } + tp->recv_sack_cache[i].start_seq = start_seq; + tp->recv_sack_cache[i].end_seq = end_seq; /* Check for D-SACK. */ if (i == 0) { @@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (before(ack, prior_snd_una - tp->max_window)) return 0; } + } + + if (flag) + num_sacks = 1; + else { + int j; + tp->fastpath_skb_hint = NULL; + + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = num_sacks-1; i > 0; i--) { + for (j = 0; j < i; j++){ + if (after(ntohl(sp[j].start_seq), + ntohl(sp[j+1].start_seq))){ + sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq); + sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq); + sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq); + sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq); + } + + } + } + } + + /* clear flag as used for different purpose in following code */ + flag = 0; + + for (i=0; i<num_sacks; i++, sp++) { + struct sk_buff *skb; + __u32 start_seq = ntohl(sp->start_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count; + + /* Use SACK fastpath hint if valid */ + if (tp->fastpath_skb_hint) { + skb = tp->fastpath_skb_hint; + fack_count = tp->fastpath_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + fack_count = 0; + } /* Event "B" in the comment above. */ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue(skb, sk) { + sk_stream_for_retrans_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + tp->fastpath_skb_hint = skb; + tp->fastpath_cnt_hint = fack_count; + /* The retransmission queue is always in order, so * we can short-circuit the walk early. */ @@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } else { /* New sack for not retransmitted frame, @@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; tp->lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } @@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + tp->retransmit_skb_hint = NULL; } } } @@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; + tp->bytes_acked = 0; tcp_clear_retrans(tp); /* Push undo marker, if it was plain RTO and nothing @@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, int packets, u32 high_seq) { struct sk_buff *skb; - int cnt = packets; + int cnt; - BUG_TRAP(cnt <= tp->packets_out); + BUG_TRAP(packets <= tp->packets_out); + if (tp->lost_skb_hint) { + skb = tp->lost_skb_hint; + cnt = tp->lost_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + cnt = 0; + } - sk_stream_for_retrans_queue(skb, sk) { - cnt -= tcp_skb_pcount(skb); - if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + sk_stream_for_retrans_queue_from(skb, sk) { + /* TODO: do this better */ + /* this is not the most efficient way to do this... */ + tp->lost_skb_hint = skb; + tp->lost_cnt_hint = cnt; + cnt += tcp_skb_pcount(skb); + if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) break; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retransmit_queue hints + * if this is beyond hint */ + if(tp->retransmit_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { + + tp->retransmit_skb_hint = NULL; + } } } tcp_sync_left_out(tp); @@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) if (tcp_head_timedout(sk, tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { - if (tcp_skb_timedout(sk, skb) && - !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint + : sk->sk_write_queue.next; + + sk_stream_for_retrans_queue_from(skb, sk) { + if (!tcp_skb_timedout(sk, skb)) + break; + + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retrans hint */ + if (tp->retransmit_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + + tp->retransmit_skb_hint = NULL; } } + + tp->scoreboard_skb_hint = skb; + tcp_sync_left_out(tp); } } @@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) } tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; + + /* There is something screwy going on with the retrans hints after + an undo */ + clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) sk_stream_for_retrans_queue(skb, sk) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } + + clear_all_retrans_hints(tp); + DBGUNDO(sk, tp, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; @@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, TCP_ECN_queue_cwr(tp); } + tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); } @@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } /* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Superceeds RFC1323) + * with this code. (Supersedes RFC1323) */ -static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) +static void tcp_ack_saw_tstamp(struct sock *sk, int flag) { /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment @@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> * * Changed: reset backoff as soon as we see the first valid sample. - * If we do not, we get strongly overstimated rto. With timestamps + * If we do not, we get strongly overestimated rto. With timestamps * samples are accepted even from very old segments: f.e., when rtt=1 * increases to 8, we retransmit 5 times and after 8 seconds delayed * answer arrives rto becomes 120 seconds! If at least one of segments @@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) */ struct tcp_sock *tp = tcp_sk(sk); const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, - const s32 seq_rtt, u32 *usrtt) + const s32 seq_rtt) { const struct tcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(sk, usrtt, flag); + tcp_ack_saw_tstamp(sk, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); + tcp_ack_no_tstamp(sk, seq_rtt, flag); } static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, @@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } +static inline u32 tcp_usrtt(const struct sk_buff *skb) +{ + struct timeval tv, now; + + do_gettimeofday(&now); + skb_get_timestamp(skb, &tv); + return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec); +} /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; - struct timeval usnow; u32 pkts_acked = 0; - - if (seq_usrtt) - do_gettimeofday(&usnow); + void (*rtt_sample)(struct sock *sk, u32 usrtt) + = icsk->icsk_ca_ops->rtt_sample; while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt tp->retrans_out -= tcp_skb_pcount(skb); acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - if (seq_usrtt) { - struct timeval tv; - - skb_get_timestamp(skb, &tv); - *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 - + (usnow.tv_usec - tv.tv_usec); + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); } - if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt !before(scb->end_seq, tp->snd_up)) tp->urg_mode = 0; } - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); + } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); + clear_all_retrans_hints(tp); } if (acked&FLAG_ACKED) { - const struct inet_connection_sock *icsk = inet_csk(sk); - tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); + tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk, tp); if (icsk->icsk_ca_ops->pkts_acked) @@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) } /* F-RTO affects on two new ACKs following RTO. - * At latest on third ACK the TCP behavor is back to normal. + * At latest on third ACK the TCP behavior is back to normal. */ tp->frto_counter = (tp->frto_counter + 1) % 3; } @@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; - s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) goto old_ack; + if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR) + tp->bytes_acked += ack - prior_snd_una; + if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. @@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt, - icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(sk, flag)) { - /* Advanve CWND, if state allows this. */ + /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); @@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, { struct sk_buff *skb; - /* First, check that queue is collapsable and find + /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ for (skb = head; skb != tail; ) { /* No new bits? It is possible on ofo queue. */ @@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) /* * This routine is only called when we have urgent data - * signalled. Its the 'slow' part of tcp_urg. It could be + * signaled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. @@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) * urgent. To do this requires some care. We cannot just ignore * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives - * or we break the sematics of SIOCATMARK (and thus sockatmark()) + * or we break the semantics of SIOCATMARK (and thus sockatmark()) * * NOTE. Double Dutch. Rendering to plain English: author of comment * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); @@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rx_opt.saw_tstamp = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_predition is to be made + * if header_prediction is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to * turn it off (when there are holes in the receive @@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, NULL, 0); + tcp_ack_saw_tstamp(sk, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4372,6 +4471,7 @@ discard: EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(sysctl_tcp_reordering); +EXPORT_SYMBOL(sysctl_tcp_abc); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 634dabb558f..4d5021e1929 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -39,7 +39,7 @@ * request_sock handling and moved * most of it into the af independent code. * Added tail drop and some other bugfixes. - * Added new listen sematics. + * Added new listen semantics. * Mike McLagan : Routing by source * Juan Jose Ciarlante: ip_dynaddr bits * Andi Kleen: various fixes. @@ -1110,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) static int tcp_v4_checksum_init(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, skb->csum)) + skb->nh.iph->daddr, skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; - - LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); - skb->ip_summed = CHECKSUM_NONE; + } } + + skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len, IPPROTO_TCP, 0); + if (skb->len <= 76) { - if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, - skb_checksum(skb, 0, skb->len, 0))) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { - skb->csum = ~tcp_v4_check(skb->h.th, skb->len, - skb->nh.iph->saddr, - skb->nh.iph->daddr, 0); + return __skb_checksum_complete(skb); } return 0; } @@ -1216,10 +1210,10 @@ int tcp_v4_rcv(struct sk_buff *skb) /* An explanation is required here, I think. * Packet length and doff are validated by header prediction, - * provided case of th->doff==0 is elimineted. + * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v4_checksum_init(skb) < 0)) + tcp_v4_checksum_init(skb))) goto bad_packet; th = skb->h.th; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b1a63b2c6b4..1b66a2ac432 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -158,7 +158,7 @@ kill_with_rst: /* I am shamed, but failed to make it more elegant. * Yes, it is direct reference to IP, which is impossible * to generalize to IPv6. Taking into account that IPv6 - * do not undertsnad recycling in any case, it not + * do not understand recycling in any case, it not * a big problem in practice. --ANK */ if (tw->tw_family == AF_INET && tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && @@ -194,7 +194,7 @@ kill_with_rst: /* In window segment, it may be only reset or bare ack. */ if (th->rst) { - /* This is TIME_WAIT assasination, in two flavors. + /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ @@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, */ newtp->snd_cwnd = 2; newtp->snd_cwnd_cnt = 0; + newtp->bytes_acked = 0; newtp->frto_counter = 0; newtp->frto_highmark = 0; @@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet - * sent (the segment carries an unaccaptable ACK) ... + * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b907456a79f..b7325e0b406 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { - if (skb != NULL) { - const struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - int tcp_header_size = tp->tcp_header_len; - struct tcphdr *th; - int sysctl_flags; - int err; + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet; + struct tcp_sock *tp; + struct tcp_skb_cb *tcb; + int tcp_header_size; + struct tcphdr *th; + int sysctl_flags; + int err; + + BUG_ON(!skb || !tcp_skb_pcount(skb)); - BUG_ON(!tcp_skb_pcount(skb)); + /* If congestion control is doing timestamping, we must + * take such a timestamp before we potentially clone/copy. + */ + if (icsk->icsk_ca_ops->rtt_sample) + __net_timestamp(skb); + + if (likely(clone_it)) { + if (unlikely(skb_cloned(skb))) + skb = pskb_copy(skb, gfp_mask); + else + skb = skb_clone(skb, gfp_mask); + if (unlikely(!skb)) + return -ENOBUFS; + } + + inet = inet_sk(sk); + tp = tcp_sk(sk); + tcb = TCP_SKB_CB(skb); + tcp_header_size = tp->tcp_header_len; #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 - /* If congestion control is doing timestamping */ - if (icsk->icsk_ca_ops->rtt_sample) - __net_timestamp(skb); - - sysctl_flags = 0; - if (tcb->flags & TCPCB_FLAG_SYN) { - tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if(sysctl_tcp_timestamps) { - tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_TSTAMPS; - } - if(sysctl_tcp_window_scaling) { - tcp_header_size += TCPOLEN_WSCALE_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_WSCALE; - } - if(sysctl_tcp_sack) { - sysctl_flags |= SYSCTL_FLAG_SACK; - if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) - tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; - } - } else if (tp->rx_opt.eff_sacks) { - /* A SACK is 2 pad bytes, a 2 byte header, plus - * 2 32-bit sequence numbers for each SACK block. - */ - tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); + sysctl_flags = 0; + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; + if(sysctl_tcp_timestamps) { + tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_TSTAMPS; } - - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(sk, CA_EVENT_TX_START); - - th = (struct tcphdr *) skb_push(skb, tcp_header_size); - skb->h.th = th; - skb_set_owner_w(skb, sk); - - /* Build TCP header and checksum it. */ - th->source = inet->sport; - th->dest = inet->dport; - th->seq = htonl(tcb->seq); - th->ack_seq = htonl(tp->rcv_nxt); - *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); - if (tcb->flags & TCPCB_FLAG_SYN) { - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. - */ - th->window = htons(tp->rcv_wnd); - } else { - th->window = htons(tcp_select_window(sk)); + if (sysctl_tcp_window_scaling) { + tcp_header_size += TCPOLEN_WSCALE_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_WSCALE; } - th->check = 0; - th->urg_ptr = 0; - - if (tp->urg_mode && - between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) { - th->urg_ptr = htons(tp->snd_up-tcb->seq); - th->urg = 1; + if (sysctl_tcp_sack) { + sysctl_flags |= SYSCTL_FLAG_SACK; + if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; } + } else if (unlikely(tp->rx_opt.eff_sacks)) { + /* A SACK is 2 pad bytes, a 2 byte header, plus + * 2 32-bit sequence numbers for each SACK block. + */ + tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK)); + } + + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); + + th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb->h.th = th; + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. */ + th->source = inet->sport; + th->dest = inet->dport; + th->seq = htonl(tcb->seq); + th->ack_seq = htonl(tp->rcv_nxt); + *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | + tcb->flags); + + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + th->window = htons(tp->rcv_wnd); + } else { + th->window = htons(tcp_select_window(sk)); + } + th->check = 0; + th->urg_ptr = 0; - if (tcb->flags & TCPCB_FLAG_SYN) { - tcp_syn_build_options((__u32 *)(th + 1), - tcp_advertise_mss(sk), - (sysctl_flags & SYSCTL_FLAG_TSTAMPS), - (sysctl_flags & SYSCTL_FLAG_SACK), - (sysctl_flags & SYSCTL_FLAG_WSCALE), - tp->rx_opt.rcv_wscale, - tcb->when, - tp->rx_opt.ts_recent); - } else { - tcp_build_and_update_options((__u32 *)(th + 1), - tp, tcb->when); + if (unlikely(tp->urg_mode && + between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) { + th->urg_ptr = htons(tp->snd_up-tcb->seq); + th->urg = 1; + } - TCP_ECN_send(sk, tp, skb, tcp_header_size); - } - tp->af_specific->send_check(sk, th, skb->len, skb); + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + tcp_syn_build_options((__u32 *)(th + 1), + tcp_advertise_mss(sk), + (sysctl_flags & SYSCTL_FLAG_TSTAMPS), + (sysctl_flags & SYSCTL_FLAG_SACK), + (sysctl_flags & SYSCTL_FLAG_WSCALE), + tp->rx_opt.rcv_wscale, + tcb->when, + tp->rx_opt.ts_recent); + } else { + tcp_build_and_update_options((__u32 *)(th + 1), + tp, tcb->when); + TCP_ECN_send(sk, tp, skb, tcp_header_size); + } - if (tcb->flags & TCPCB_FLAG_ACK) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); + tp->af_specific->send_check(sk, th, skb->len, skb); - if (skb->len != tcp_header_size) - tcp_event_data_sent(tp, skb, sk); + if (likely(tcb->flags & TCPCB_FLAG_ACK)) + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb, sk); - err = tp->af_specific->queue_xmit(skb, 0); - if (err <= 0) - return err; + TCP_INC_STATS(TCP_MIB_OUTSEGS); - tcp_enter_cwr(sk); + err = tp->af_specific->queue_xmit(skb, 0); + if (unlikely(err <= 0)) + return err; + + tcp_enter_cwr(sk); + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; - /* NET_XMIT_CN is special. It does not guarantee, - * that this packet is lost. It tells that device - * is about to start to drop packets or already - * drops some packets of the same priority and - * invokes us to send less aggressively. - */ - return err == NET_XMIT_CN ? 0 : err; - } - return -ENOBUFS; #undef SYSCTL_FLAG_TSTAMPS #undef SYSCTL_FLAG_WSCALE #undef SYSCTL_FLAG_SACK @@ -436,6 +453,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss u16 flags; BUG_ON(len > skb->len); + + clear_all_retrans_hints(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -599,7 +618,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) for TCP options, but includes only bare TCP header. tp->rx_opt.mss_clamp is mss negotiated at connection setup. - It is minumum of user_mss and mss received with SYN. + It is minimum of user_mss and mss received with SYN. It also does not include TCP options. tp->pmtu_cookie is last pmtu, seen by this function. @@ -1034,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) + if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC))) break; /* Advance the send_head. This one is sent out. @@ -1107,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { + if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { update_send_head(sk, tp, skb); tcp_cwnd_validate(sk, tp); return; @@ -1171,7 +1190,7 @@ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - /* MSS for the peer's data. Previous verions used mss_clamp + /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct * but may be worse for the performance because of rcv_mss @@ -1260,7 +1279,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* Ok. We will be able to collapse the packet. */ + /* changing transmit queue under us so clear hints */ + clear_all_retrans_hints(tp); + + /* Ok. We will be able to collapse the packet. */ __skb_unlink(next_skb, &sk->sk_write_queue); memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); @@ -1330,6 +1352,8 @@ void tcp_simple_retransmit(struct sock *sk) } } + clear_all_retrans_hints(tp); + if (!lost) return; @@ -1361,7 +1385,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int err; /* Do not sent more than we queued. 1/4 is reserved for possible - * copying overhead: frgagmentation, tunneling, mangling etc. + * copying overhead: fragmentation, tunneling, mangling etc. */ if (atomic_read(&sk->sk_wmem_alloc) > min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) @@ -1422,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, (skb_cloned(skb) ? - pskb_copy(skb, GFP_ATOMIC): - skb_clone(skb, GFP_ATOMIC))); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (err == 0) { /* Update global TCP statistics. */ @@ -1468,13 +1490,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int packet_cnt = tp->lost_out; + int packet_cnt; + + if (tp->retransmit_skb_hint) { + skb = tp->retransmit_skb_hint; + packet_cnt = tp->retransmit_cnt_hint; + }else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } /* First pass: retransmit lost packets. */ - if (packet_cnt) { - sk_stream_for_retrans_queue(skb, sk) { + if (tp->lost_out) { + sk_stream_for_retrans_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + /* we could do better than to assign each time */ + tp->retransmit_skb_hint = skb; + tp->retransmit_cnt_hint = packet_cnt; + /* Assume this retransmit will generate * only one packet for congestion window * calculation purposes. This works because @@ -1485,10 +1519,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; - if (sacked&TCPCB_LOST) { + if (sacked & TCPCB_LOST) { if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->retransmit_skb_hint = NULL; return; + } if (icsk->icsk_ca_state != TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); else @@ -1501,8 +1537,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) TCP_RTO_MAX); } - packet_cnt -= tcp_skb_pcount(skb); - if (packet_cnt <= 0) + packet_cnt += tcp_skb_pcount(skb); + if (packet_cnt >= tp->lost_out) break; } } @@ -1528,9 +1564,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_may_send_now(sk, tp)) return; - packet_cnt = 0; + if (tp->forward_skb_hint) { + skb = tp->forward_skb_hint; + packet_cnt = tp->forward_cnt_hint; + } else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } + + sk_stream_for_retrans_queue_from(skb, sk) { + tp->forward_cnt_hint = packet_cnt; + tp->forward_skb_hint = skb; - sk_stream_for_retrans_queue(skb, sk) { /* Similar to the retransmit loop above we * can pretend that the retransmitted SKB * we send out here will be composed of one @@ -1547,8 +1592,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) continue; /* Ok, retransmit it. */ - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->forward_skb_hint = NULL; break; + } if (skb == skb_peek(&sk->sk_write_queue)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, @@ -1633,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (tcp_transmit_skb(sk, skb)) + if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); } @@ -1668,7 +1715,7 @@ int tcp_send_synack(struct sock *sk) TCP_ECN_send_synack(tcp_sk(sk), skb); } TCP_SKB_CB(skb)->when = tcp_time_stamp; - return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /* @@ -1829,7 +1876,7 @@ int tcp_connect(struct sock *sk) __skb_queue_tail(&sk->sk_write_queue, buff); sk_charge_skb(sk, buff); tp->packets_out += tcp_skb_pcount(buff); - tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); + tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ @@ -1925,7 +1972,7 @@ void tcp_send_ack(struct sock *sk) /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff); + tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); } } @@ -1965,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - return tcp_transmit_skb(sk, skb); + return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } int tcp_write_wakeup(struct sock *sk) @@ -1998,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) { update_send_head(sk, tp, skb); } @@ -2058,3 +2105,4 @@ EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_sync_mss); +EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 327770bf552..26d7486ee50 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { tp->snd_cwnd_cnt++; if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } } - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - tp->snd_cwnd_stamp = tcp_time_stamp; } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 415ee47ac1c..e1880959614 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * - * Criterium is still not confirmed experimentally and may change. + * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. @@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk) hole detection. :-( It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any + to make it. It is disgusting. It does not work in any case. Let me to cite the same draft, which requires for us to implement this: diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 93c5f92070f..13e7e6e8df1 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -215,14 +215,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, vegas->beg_snd_nxt = tp->snd_nxt; vegas->beg_snd_cwnd = tp->snd_cwnd; - /* Take into account the current RTT sample too, to - * decrease the impact of delayed acks. This double counts - * this sample since we count it for the next window as well, - * but that's not too awful, since we're taking the min, - * rather than averaging. - */ - tcp_vegas_rtt_calc(sk, seq_rtt * 1000); - /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. @@ -236,8 +228,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); } else { u32 rtt, target_cwnd, diff; @@ -275,7 +266,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, */ diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - if (tp->snd_cwnd < tp->snd_ssthresh) { + if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ if (diff > gamma) { /* Going too fast. Time to slow down @@ -295,6 +286,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, V_PARAM_SHIFT)+1); } + tcp_slow_start(tp); } else { /* Congestion avoidance. */ u32 next_snd_cwnd; @@ -327,37 +319,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, else if (next_snd_cwnd < tp->snd_cwnd) tp->snd_cwnd--; } + + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; } /* Wipe the slate clean for the next RTT. */ vegas->cntRTT = 0; vegas->minRTT = 0x7fffffff; } - - /* The following code is executed for every ack we receive, - * except for conditions checked in should_advance_cwnd() - * before the call to tcp_cong_avoid(). Mainly this means that - * we only execute this code if the ack actually acked some - * data. - */ - - /* If we are in slow start, increase our cwnd in response to this ACK. - * (If we are not in slow start then we are in congestion avoidance, - * and adjust our congestion window only once per RTT. See the code - * above.) - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tp->snd_cwnd++; - - /* to keep cwnd from growing without bound */ - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - - /* Make sure that we are never so timid as to reduce our cwnd below - * 2 MSS. - * - * Going below 2 MSS would risk huge delayed ACKs from our receiver. - */ - tp->snd_cwnd = max(tp->snd_cwnd, 2U); } /* Extract info for Tcp socket info provided via netlink. */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e0bd1013cb0..2422a5f7195 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) static __inline__ int __udp_checksum_complete(struct sk_buff *skb) { - return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + return __skb_checksum_complete(skb); } static __inline__ int udp_checksum_complete(struct sk_buff *skb) @@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, if (uh->check == 0) { skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) - return 0; - LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; } if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index b2b60f3e9cd..42196ba3b0b 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -182,6 +182,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) case IPPROTO_UDP: case IPPROTO_TCP: case IPPROTO_SCTP: + case IPPROTO_DCCP: if (pskb_may_pull(skb, xprth + 4 - skb->data)) { u16 *ports = (u16 *)xprth; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ddcf7754eec..a60585fd85a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -137,6 +137,7 @@ static int addrconf_ifdown(struct net_device *dev, int how); static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags); static void addrconf_dad_timer(unsigned long data); static void addrconf_dad_completed(struct inet6_ifaddr *ifp); +static void addrconf_dad_run(struct inet6_dev *idev); static void addrconf_rs_timer(unsigned long data); static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); @@ -379,8 +380,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) dev->type == ARPHRD_NONE || dev->type == ARPHRD_SIT) { printk(KERN_INFO - "Disabled Privacy Extensions on device %p(%s)\n", - dev, dev->name); + "%s: Disabled Privacy Extensions\n", + dev->name); ndev->cnf.use_tempaddr = -1; } else { in6_dev_hold(ndev); @@ -388,6 +389,9 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) } #endif + if (netif_carrier_ok(dev)) + ndev->if_flags |= IF_READY; + write_lock_bh(&addrconf_lock); dev->ip6_ptr = ndev; write_unlock_bh(&addrconf_lock); @@ -415,6 +419,7 @@ static struct inet6_dev * ipv6_find_idev(struct net_device *dev) if ((idev = ipv6_add_dev(dev)) == NULL) return NULL; } + if (dev->flags&IFF_UP) ipv6_mc_up(idev); return idev; @@ -634,8 +639,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) } #endif - for (ifap = &idev->addr_list; (ifa=*ifap) != NULL; - ifap = &ifa->if_next) { + for (ifap = &idev->addr_list; (ifa=*ifap) != NULL;) { if (ifa == ifp) { *ifap = ifa->if_next; __in6_ifa_put(ifp); @@ -643,6 +647,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0) break; deleted = 1; + continue; } else if (ifp->flags & IFA_F_PERMANENT) { if (ipv6_prefix_equal(&ifa->addr, &ifp->addr, ifp->prefix_len)) { @@ -666,6 +671,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) } } } + ifap = &ifa->if_next; } write_unlock_bh(&idev->lock); @@ -903,11 +909,18 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, score.addr_type = __ipv6_addr_type(&ifa->addr); - /* Rule 0: Candidate Source Address (section 4) + /* Rule 0: + * - Tentative Address (RFC2462 section 5.4) + * - A tentative address is not considered + * "assigned to an interface" in the traditional + * sense. + * - Candidate Source Address (section 4) * - In any case, anycast addresses, multicast * addresses, and the unspecified address MUST * NOT be included in a candidate set. */ + if (ifa->flags & IFA_F_TENTATIVE) + continue; if (unlikely(score.addr_type == IPV6_ADDR_ANY || score.addr_type & IPV6_ADDR_MULTICAST)) { LIMIT_NETDEBUG(KERN_DEBUG @@ -985,6 +998,8 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, } /* Rule 4: Prefer home address -- not implemented yet */ + if (hiscore.rule < 4) + hiscore.rule++; /* Rule 5: Prefer outgoing interface */ if (hiscore.rule < 5) { @@ -1045,9 +1060,10 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, } #endif /* Rule 8: Use longest matching prefix */ - if (hiscore.rule < 8) + if (hiscore.rule < 8) { hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr); - score.rule++; + hiscore.rule++; + } score.matchlen = ipv6_addr_diff(&ifa->addr, daddr); if (score.matchlen > hiscore.matchlen) { score.rule = 8; @@ -1212,10 +1228,8 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) /* Gets referenced address, destroys ifaddr */ -void addrconf_dad_failure(struct inet6_ifaddr *ifp) +void addrconf_dad_stop(struct inet6_ifaddr *ifp) { - if (net_ratelimit()) - printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); if (ifp->flags&IFA_F_PERMANENT) { spin_lock_bh(&ifp->lock); addrconf_del_timer(ifp); @@ -1241,6 +1255,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ipv6_del_addr(ifp); } +void addrconf_dad_failure(struct inet6_ifaddr *ifp) +{ + if (net_ratelimit()) + printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); + addrconf_dad_stop(ifp); +} /* Join to solicited addr multicast group. */ @@ -1593,9 +1613,17 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) not good. */ if (valid_lft >= 0x7FFFFFFF/HZ) - rt_expires = 0; + rt_expires = 0x7FFFFFFF - (0x7FFFFFFF % HZ); else - rt_expires = jiffies + valid_lft * HZ; + rt_expires = valid_lft * HZ; + + /* + * We convert this (in jiffies) to clock_t later. + * Avoid arithmetic overflow there as well. + * Overflow can happen only if HZ < USER_HZ. + */ + if (HZ < USER_HZ && rt_expires > 0x7FFFFFFF / USER_HZ) + rt_expires = 0x7FFFFFFF / USER_HZ; if (pinfo->onlink) { struct rt6_info *rt; @@ -1607,12 +1635,12 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) ip6_del_rt(rt, NULL, NULL, NULL); rt = NULL; } else { - rt->rt6i_expires = rt_expires; + rt->rt6i_expires = jiffies + rt_expires; } } } else if (valid_lft) { addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); + dev, jiffies_to_clock_t(rt_expires), RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); } if (rt) dst_release(&rt->u.dst); @@ -2122,9 +2150,42 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, { struct net_device *dev = (struct net_device *) data; struct inet6_dev *idev = __in6_dev_get(dev); + int run_pending = 0; switch(event) { case NETDEV_UP: + case NETDEV_CHANGE: + if (event == NETDEV_UP) { + if (!netif_carrier_ok(dev)) { + /* device is not ready yet. */ + printk(KERN_INFO + "ADDRCONF(NETDEV_UP): %s: " + "link is not ready\n", + dev->name); + break; + } + } else { + if (!netif_carrier_ok(dev)) { + /* device is still not ready. */ + break; + } + + if (idev) { + if (idev->if_flags & IF_READY) { + /* device is already configured. */ + break; + } + idev->if_flags |= IF_READY; + } + + printk(KERN_INFO + "ADDRCONF(NETDEV_CHANGE): %s: " + "link becomes ready\n", + dev->name); + + run_pending = 1; + } + switch(dev->type) { case ARPHRD_SIT: addrconf_sit_config(dev); @@ -2141,6 +2202,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; }; if (idev) { + if (run_pending) + addrconf_dad_run(idev); + /* If the MTU changed during the interface down, when the interface up, the changed MTU must be reflected in the idev as well as routers. @@ -2175,8 +2239,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, */ addrconf_ifdown(dev, event != NETDEV_DOWN); break; - case NETDEV_CHANGE: - break; + case NETDEV_CHANGENAME: #ifdef CONFIG_SYSCTL if (idev) { @@ -2257,7 +2320,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) /* Step 3: clear flags for stateless addrconf */ if (how != 1) - idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD); + idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); /* Step 4: clear address list */ #ifdef CONFIG_IPV6_PRIVACY @@ -2366,11 +2429,20 @@ out: /* * Duplicate Address Detection */ +static void addrconf_dad_kick(struct inet6_ifaddr *ifp) +{ + unsigned long rand_num; + struct inet6_dev *idev = ifp->idev; + + rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); + ifp->probes = idev->cnf.dad_transmits; + addrconf_mod_timer(ifp, AC_DAD, rand_num); +} + static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; - unsigned long rand_num; addrconf_join_solict(dev, &ifp->addr); @@ -2379,7 +2451,6 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) flags); net_srandom(ifp->addr.s6_addr32[3]); - rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); read_lock_bh(&idev->lock); if (ifp->dead) @@ -2396,9 +2467,19 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) return; } - ifp->probes = idev->cnf.dad_transmits; - addrconf_mod_timer(ifp, AC_DAD, rand_num); - + if (!(idev->if_flags & IF_READY)) { + spin_unlock_bh(&ifp->lock); + read_unlock_bh(&idev->lock); + /* + * If the defice is not ready: + * - keep it tentative if it is a permanent address. + * - otherwise, kill it. + */ + in6_ifa_hold(ifp); + addrconf_dad_stop(ifp); + return; + } + addrconf_dad_kick(ifp); spin_unlock_bh(&ifp->lock); out: read_unlock_bh(&idev->lock); @@ -2481,6 +2562,22 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) } } +static void addrconf_dad_run(struct inet6_dev *idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) { + spin_lock_bh(&ifp->lock); + if (!(ifp->flags & IFA_F_TENTATIVE)) { + spin_unlock_bh(&ifp->lock); + continue; + } + spin_unlock_bh(&ifp->lock); + addrconf_dad_kick(ifp); + } + read_unlock_bh(&idev->lock); +} + #ifdef CONFIG_PROC_FS struct if6_iter_state { int bucket; @@ -2626,7 +2723,7 @@ static void addrconf_verify(unsigned long foo) for (i=0; i < IN6_ADDR_HSIZE; i++) { restart: - write_lock(&addrconf_hash_lock); + read_lock(&addrconf_hash_lock); for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) { unsigned long age; #ifdef CONFIG_IPV6_PRIVACY @@ -2648,7 +2745,7 @@ restart: if (age >= ifp->valid_lft) { spin_unlock(&ifp->lock); in6_ifa_hold(ifp); - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); ipv6_del_addr(ifp); goto restart; } else if (age >= ifp->prefered_lft) { @@ -2667,7 +2764,7 @@ restart: if (deprecate) { in6_ifa_hold(ifp); - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); @@ -2685,7 +2782,10 @@ restart: in6_ifa_hold(ifp); in6_ifa_hold(ifpub); spin_unlock(&ifp->lock); - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); + spin_lock(&ifpub->lock); + ifpub->regen_count = 0; + spin_unlock(&ifpub->lock); ipv6_create_tempaddr(ifpub, ifp); in6_ifa_put(ifpub); in6_ifa_put(ifp); @@ -2702,7 +2802,7 @@ restart: spin_unlock(&ifp->lock); } } - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); } addr_chk_timer.expires = time_before(next, jiffies + HZ) ? jiffies + HZ : next; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 4f8795af2ed..d9546380fa0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -92,10 +92,13 @@ static int inet6_create(struct socket *sock, int protocol) struct proto *answer_prot; unsigned char answer_flags; char answer_no_check; - int rc; + int try_loading_module = 0; + int err; /* Look for the requested type/protocol pair. */ answer = NULL; +lookup_protocol: + err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_rcu(p, &inetsw6[sock->type]) { answer = list_entry(p, struct inet_protosw, list); @@ -113,21 +116,37 @@ static int inet6_create(struct socket *sock, int protocol) if (IPPROTO_IP == answer->protocol) break; } + err = -EPROTONOSUPPORT; answer = NULL; } - rc = -ESOCKTNOSUPPORT; - if (!answer) - goto out_rcu_unlock; - rc = -EPERM; + if (!answer) { + if (try_loading_module < 2) { + rcu_read_unlock(); + /* + * Be more specific, e.g. net-pf-10-proto-132-type-1 + * (net-pf-PF_INET6-proto-IPPROTO_SCTP-type-SOCK_STREAM) + */ + if (++try_loading_module == 1) + request_module("net-pf-%d-proto-%d-type-%d", + PF_INET6, protocol, sock->type); + /* + * Fall back to generic, e.g. net-pf-10-proto-132 + * (net-pf-PF_INET6-proto-IPPROTO_SCTP) + */ + else + request_module("net-pf-%d-proto-%d", + PF_INET6, protocol); + goto lookup_protocol; + } else + goto out_rcu_unlock; + } + + err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; - rc = -EPROTONOSUPPORT; - if (!protocol) - goto out_rcu_unlock; sock->ops = answer->ops; - answer_prot = answer->prot; answer_no_check = answer->no_check; answer_flags = answer->flags; @@ -135,14 +154,14 @@ static int inet6_create(struct socket *sock, int protocol) BUG_TRAP(answer_prot->slab != NULL); - rc = -ENOBUFS; + err = -ENOBUFS; sk = sk_alloc(PF_INET6, GFP_KERNEL, answer_prot, 1); if (sk == NULL) goto out; sock_init_data(sock, sk); - rc = 0; + err = 0; sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = 1; @@ -202,14 +221,14 @@ static int inet6_create(struct socket *sock, int protocol) sk->sk_prot->hash(sk); } if (sk->sk_prot->init) { - rc = sk->sk_prot->init(sk); - if (rc) { + err = sk->sk_prot->init(sk); + if (err) { sk_common_release(sk); goto out; } } out: - return rc; + return err; out_rcu_unlock: rcu_read_unlock(); goto out; @@ -699,12 +718,14 @@ static int __init inet6_init(void) /* Register the family here so that the init calls below will * be able to create sockets. (?? is this dangerous ??) */ - (void) sock_register(&inet6_family_ops); + err = sock_register(&inet6_family_ops); + if (err) + goto out_unregister_raw_proto; /* Initialise ipv6 mibs */ err = init_ipv6_mibs(); if (err) - goto out_unregister_raw_proto; + goto out_unregister_sock; /* * ipngwg API draft makes clear that the correct semantics @@ -796,6 +817,8 @@ icmp_fail: ipv6_sysctl_unregister(); #endif cleanup_ipv6_mibs(); +out_unregister_sock: + sock_unregister(PF_INET6); out_unregister_raw_proto: proto_unregister(&rawv6_prot); out_unregister_udp_proto: diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index cc518405b3e..c4a3a993acb 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -437,7 +437,7 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) break; case IPPROTO_AH: nexthdr = ptr[0]; - len = (ptr[1] + 1) << 2; + len = (ptr[1] + 2) << 2; break; default: nexthdr = ptr[0]; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 40d9a1935ab..8bfbe997079 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -248,7 +248,7 @@ static u32 esp6_get_max_size(struct xfrm_state *x, int mtu) if (esp->conf.padlen) mtu = ALIGN(mtu, esp->conf.padlen); - return mtu + x->props.header_len + esp->auth.icv_full_len; + return mtu + x->props.header_len + esp->auth.icv_trunc_len; } static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 922549581ab..be6faf31138 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -628,6 +628,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, if (!tot_len) return NULL; + tot_len += sizeof(*opt2); opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC); if (!opt2) return ERR_PTR(-ENOBUFS); @@ -668,7 +669,26 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, return opt2; out: - sock_kfree_s(sk, p, tot_len); + sock_kfree_s(sk, opt2, opt2->tot_len); return ERR_PTR(err); } +struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, + struct ipv6_txoptions *opt) +{ + /* + * ignore the dest before srcrt unless srcrt is being included. + * --yoshfuji + */ + if (opt && opt->dst0opt && !opt->srcrt) { + if (opt_space != opt) { + memcpy(opt_space, opt, sizeof(*opt_space)); + opt = opt_space; + } + opt->opt_nflen -= ipv6_optlen(opt->dst0opt); + opt->dst0opt = NULL; + } + + return opt; +} + diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 23e540365a1..6ec6a2b549b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -328,8 +328,10 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, iif = skb->dev->ifindex; /* - * Must not send if we know that source is Anycast also. - * for now we don't know that. + * Must not send error if the source does not uniquely + * identify a single node (RFC2463 Section 2.4). + * We check unspecified / multicast addresses here, + * and anycast addresses will be checked later. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); @@ -373,6 +375,16 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, err = ip6_dst_lookup(sk, &dst, &fl); if (err) goto out; + + /* + * We won't send icmp if the destination is known + * anycast. + */ + if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n"); + goto out_dst_release; + } + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) goto out; @@ -585,17 +597,16 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) daddr = &skb->nh.ipv6h->daddr; /* Perform checksum. */ - if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, - skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } - if (skb->ip_summed == CHECKSUM_NONE) { - if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, - skb_checksum(skb, 0, skb->len, 0))) { + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, + skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len, + IPPROTO_ICMPV6, 0); + if (__skb_checksum_complete(skb)) { LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", NIP6(*saddr), NIP6(*daddr)); goto discard_it; @@ -752,7 +763,7 @@ void icmpv6_cleanup(void) inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); } -static struct icmp6_err { +static const struct icmp6_err { int err; int fatal; } tab_unreach[] = { diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index bbbe80cdaf7..1cf02765fb5 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -225,20 +225,16 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space, struct ip6_flowlabel * fl, struct ipv6_txoptions * fopt) { - struct ipv6_txoptions * fl_opt = fl ? fl->opt : NULL; - - if (fopt == NULL || fopt->opt_flen == 0) { - if (!fl_opt || !fl_opt->dst0opt || fl_opt->srcrt) - return fl_opt; - } - + struct ipv6_txoptions * fl_opt = fl->opt; + + if (fopt == NULL || fopt->opt_flen == 0) + return fl_opt; + if (fl_opt != NULL) { opt_space->hopopt = fl_opt->hopopt; - opt_space->dst0opt = fl_opt->srcrt ? fl_opt->dst0opt : NULL; + opt_space->dst0opt = fl_opt->dst0opt; opt_space->srcrt = fl_opt->srcrt; opt_space->opt_nflen = fl_opt->opt_nflen; - if (fl_opt->dst0opt && !fl_opt->srcrt) - opt_space->opt_nflen -= ipv6_optlen(fl_opt->dst0opt); } else { if (fopt->opt_nflen == 0) return fopt; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c1fa693511a..8523c76ebf7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -774,7 +774,8 @@ out_err_release: *dst = NULL; return err; } -inline int ip6_ufo_append_data(struct sock *sk, + +static inline int ip6_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 003fd99ff59..3620718defe 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -287,7 +287,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, { struct ipv6_txoptions *opt; if (optlen == 0) - optval = 0; + optval = NULL; /* hop-by-hop / destination options are privileged option */ retv = -EPERM; @@ -628,8 +628,8 @@ e_inval: return -EINVAL; } -int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_opt_hdr *hdr, - char __user *optval, int len) +static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_opt_hdr *hdr, + char __user *optval, int len) { if (!hdr) return 0; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index f15e04ad026..f829a4ad3cc 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -170,7 +170,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, #define MLDV2_QQIC(value) MLDV2_EXP(0x80, 4, 3, value) #define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value) -#define IPV6_MLD_MAX_MSF 10 +#define IPV6_MLD_MAX_MSF 64 int sysctl_mld_max_msf = IPV6_MLD_MAX_MSF; @@ -224,6 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = MCAST_EXCLUDE; + mc_lst->sflock = RW_LOCK_UNLOCKED; mc_lst->sflist = NULL; /* @@ -360,6 +361,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, struct ip6_sf_socklist *psl; int i, j, rv; int leavegroup = 0; + int pmclocked = 0; int err; if (pgsr->gsr_group.ss_family != AF_INET6 || @@ -403,6 +405,9 @@ int ip6_mc_source(int add, int omode, struct sock *sk, pmc->sfmode = omode; } + write_lock_bh(&pmc->sflock); + pmclocked = 1; + psl = pmc->sflist; if (!add) { if (!psl) @@ -475,6 +480,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk, /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: + if (pmclocked) + write_unlock_bh(&pmc->sflock); read_unlock_bh(&ipv6_sk_mc_lock); read_unlock_bh(&idev->lock); in6_dev_put(idev); @@ -510,6 +517,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) dev = idev->dev; err = 0; + read_lock_bh(&ipv6_sk_mc_lock); + if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { leavegroup = 1; goto done; @@ -549,6 +558,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) newpsl = NULL; (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); } + + write_lock_bh(&pmc->sflock); psl = pmc->sflist; if (psl) { (void) ip6_mc_del_src(idev, group, pmc->sfmode, @@ -558,8 +569,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); pmc->sflist = newpsl; pmc->sfmode = gsf->gf_fmode; + write_unlock_bh(&pmc->sflock); err = 0; done: + read_unlock_bh(&ipv6_sk_mc_lock); read_unlock_bh(&idev->lock); in6_dev_put(idev); dev_put(dev); @@ -592,6 +605,11 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, dev = idev->dev; err = -EADDRNOTAVAIL; + /* + * changes to the ipv6_mc_list require the socket lock and + * a read lock on ip6_sk_mc_lock. We have the socket lock, + * so reading the list is safe. + */ for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { if (pmc->ifindex != gsf->gf_interface) @@ -614,6 +632,10 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { return -EFAULT; } + /* changes to psl require the socket lock, a read lock on + * on ipv6_sk_mc_lock and a write lock on pmc->sflock. We + * have the socket lock, so reading here is safe. + */ for (i=0; i<copycount; i++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; @@ -650,6 +672,7 @@ int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr, read_unlock(&ipv6_sk_mc_lock); return 1; } + read_lock(&mc->sflock); psl = mc->sflist; if (!psl) { rv = mc->sfmode == MCAST_EXCLUDE; @@ -665,6 +688,7 @@ int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr, if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) rv = 0; } + read_unlock(&mc->sflock); read_unlock(&ipv6_sk_mc_lock); return rv; @@ -1068,23 +1092,64 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) ma->mca_flags |= MAF_TIMER_RUNNING; } -static void mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, +/* mark EXCLUDE-mode sources */ +static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, + struct in6_addr *srcs) +{ + struct ip6_sf_list *psf; + int i, scount; + + scount = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; i<nsrcs; i++) { + /* skip inactive filters */ + if (pmc->mca_sfcount[MCAST_INCLUDE] || + pmc->mca_sfcount[MCAST_EXCLUDE] != + psf->sf_count[MCAST_EXCLUDE]) + continue; + if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { + scount++; + break; + } + } + } + pmc->mca_flags &= ~MAF_GSQUERY; + if (scount == nsrcs) /* all sources excluded */ + return 0; + return 1; +} + +static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; + if (pmc->mca_sfmode == MCAST_EXCLUDE) + return mld_xmarksources(pmc, nsrcs, srcs); + + /* mark INCLUDE-mode sources */ + scount = 0; for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { if (scount == nsrcs) break; - for (i=0; i<nsrcs; i++) + for (i=0; i<nsrcs; i++) { if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { psf->sf_gsresp = 1; scount++; break; } + } + } + if (!scount) { + pmc->mca_flags &= ~MAF_GSQUERY; + return 0; } + pmc->mca_flags |= MAF_GSQUERY; + return 1; } int igmp6_event_query(struct sk_buff *skb) @@ -1167,7 +1232,7 @@ int igmp6_event_query(struct sk_buff *skb) /* mark sources to include, if group & source-specific */ if (mlh2->nsrcs != 0) { if (!pskb_may_pull(skb, srcs_offset + - mlh2->nsrcs * sizeof(struct in6_addr))) { + ntohs(mlh2->nsrcs) * sizeof(struct in6_addr))) { in6_dev_put(idev); return -EINVAL; } @@ -1203,10 +1268,9 @@ int igmp6_event_query(struct sk_buff *skb) else ma->mca_flags &= ~MAF_GSQUERY; } - if (ma->mca_flags & MAF_GSQUERY) - mld_marksources(ma, ntohs(mlh2->nsrcs), - mlh2->srcs); - igmp6_group_queried(ma, max_delay); + if (!(ma->mca_flags & MAF_GSQUERY) || + mld_marksources(ma, ntohs(mlh2->nsrcs), mlh2->srcs)) + igmp6_group_queried(ma, max_delay); spin_unlock_bh(&ma->mca_lock); if (group_type != IPV6_ADDR_ANY) break; @@ -1231,6 +1295,11 @@ int igmp6_event_report(struct sk_buff *skb) if (skb->pkt_type == PACKET_LOOPBACK) return 0; + /* send our report if the MC router may not have heard this report */ + if (skb->pkt_type != PACKET_MULTICAST && + skb->pkt_type != PACKET_BROADCAST) + return 0; + if (!pskb_may_pull(skb, sizeof(struct in6_addr))) return -EINVAL; @@ -1276,7 +1345,18 @@ static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, case MLD2_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return 0; - return !((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp); + if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) { + if (pmc->mca_sfmode == MCAST_INCLUDE) + return 1; + /* don't include if this source is excluded + * in all filters + */ + if (psf->sf_count[MCAST_INCLUDE]) + return 0; + return pmc->mca_sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + } + return 0; case MLD2_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return 0; @@ -1445,7 +1525,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_report *pmr; struct mld2_grec *pgr = NULL; struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; - int scount, first, isquery, truncate; + int scount, stotal, first, isquery, truncate; if (pmc->mca_flags & MAF_NOREPORT) return skb; @@ -1455,25 +1535,13 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, truncate = type == MLD2_MODE_IS_EXCLUDE || type == MLD2_CHANGE_TO_EXCLUDE; + stotal = scount = 0; + psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; - if (!*psf_list) { - if (type == MLD2_ALLOW_NEW_SOURCES || - type == MLD2_BLOCK_OLD_SOURCES) - return skb; - if (pmc->mca_crcount || isquery) { - /* make sure we have room for group header and at - * least one source. - */ - if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)+ - sizeof(struct in6_addr)) { - mld_sendpack(skb); - skb = NULL; /* add_grhead will get a new one */ - } - skb = add_grhead(skb, pmc, type, &pgr); - } - return skb; - } + if (!*psf_list) + goto empty_source; + pmr = skb ? (struct mld2_report *)skb->h.raw : NULL; /* EX and TO_EX get a fresh packet, if needed */ @@ -1486,7 +1554,6 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } } first = 1; - scount = 0; psf_prev = NULL; for (psf=*psf_list; psf; psf=psf_next) { struct in6_addr *psrc; @@ -1520,7 +1587,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc)); *psrc = psf->sf_addr; - scount++; + scount++; stotal++; if ((type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { psf->sf_crcount--; @@ -1535,6 +1602,21 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } psf_prev = psf; } + +empty_source: + if (!stotal) { + if (type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) + return skb; + if (pmc->mca_crcount || isquery) { + /* make sure we have room for group header */ + if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { + mld_sendpack(skb); + skb = NULL; /* add_grhead will get a new one */ + } + skb = add_grhead(skb, pmc, type, &pgr); + } + } if (pgr) pgr->grec_nsrcs = htons(scount); @@ -1616,11 +1698,11 @@ static void mld_send_cr(struct inet6_dev *idev) skb = add_grec(skb, pmc, dtype, 1, 1); } if (pmc->mca_crcount) { - pmc->mca_crcount--; if (pmc->mca_sfmode == MCAST_EXCLUDE) { type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 0); } + pmc->mca_crcount--; if (pmc->mca_crcount == 0) { mld_clear_zeros(&pmc->mca_tomb); mld_clear_zeros(&pmc->mca_sources); @@ -1654,12 +1736,12 @@ static void mld_send_cr(struct inet6_dev *idev) /* filter mode changes */ if (pmc->mca_crcount) { - pmc->mca_crcount--; if (pmc->mca_sfmode == MCAST_EXCLUDE) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); + pmc->mca_crcount--; } spin_unlock_bh(&pmc->mca_lock); } @@ -2018,6 +2100,9 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, { int err; + /* callers have the socket lock and a write lock on ipv6_sk_mc_lock, + * so no other readers or writers of iml or its sflist + */ if (iml->sflist == 0) { /* any-source empty exclude case */ return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 971ba60bf6e..04912f9b35c 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -5,10 +5,20 @@ menu "IPv6: Netfilter Configuration (EXPERIMENTAL)" depends on INET && IPV6 && NETFILTER && EXPERIMENTAL -#tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP6_NF_CONNTRACK -#if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then -# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK -#fi +config NF_CONNTRACK_IPV6 + tristate "IPv6 support for new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv6 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + config IP6_NF_QUEUE tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" ---help--- @@ -114,7 +124,6 @@ config IP6_NF_MATCH_OWNER To compile it as a module, choose M here. If unsure, say N. -# dep_tristate ' MAC address match support' CONFIG_IP6_NF_MATCH_MAC $CONFIG_IP6_NF_IPTABLES config IP6_NF_MATCH_MARK tristate "netfilter MARK match support" depends on IP6_NF_IPTABLES @@ -170,15 +179,6 @@ config IP6_NF_MATCH_PHYSDEV To compile it as a module, choose M here. If unsure, say N. -# dep_tristate ' Multiple port match support' CONFIG_IP6_NF_MATCH_MULTIPORT $CONFIG_IP6_NF_IPTABLES -# dep_tristate ' TOS match support' CONFIG_IP6_NF_MATCH_TOS $CONFIG_IP6_NF_IPTABLES -# if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then -# dep_tristate ' Connection state match support' CONFIG_IP6_NF_MATCH_STATE $CONFIG_IP6_NF_CONNTRACK $CONFIG_IP6_NF_IPTABLES -# fi -# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then -# dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_UNCLEAN $CONFIG_IP6_NF_IPTABLES -# dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_OWNER $CONFIG_IP6_NF_IPTABLES -# fi # The targets config IP6_NF_FILTER tristate "Packet filtering" @@ -211,7 +211,7 @@ config IP6_NF_TARGET_REJECT config IP6_NF_TARGET_NFQUEUE tristate "NFQUEUE Target Support" - depends on IP_NF_IPTABLES + depends on IP6_NF_IPTABLES help This Target replaced the old obsolete QUEUE target. @@ -220,12 +220,6 @@ config IP6_NF_TARGET_NFQUEUE To compile it as a module, choose M here. If unsure, say N. -# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then -# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER -# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then -# dep_tristate ' MIRROR target support (EXPERIMENTAL)' CONFIG_IP6_NF_TARGET_MIRROR $CONFIG_IP6_NF_FILTER -# fi -# fi config IP6_NF_MANGLE tristate "Packet mangling" depends on IP6_NF_IPTABLES @@ -236,7 +230,6 @@ config IP6_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. -# dep_tristate ' TOS target support' CONFIG_IP6_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE config IP6_NF_TARGET_MARK tristate "MARK target support" depends on IP6_NF_MANGLE @@ -266,7 +259,6 @@ config IP6_NF_TARGET_HL To compile it as a module, choose M here. If unsure, say N. -#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES config IP6_NF_RAW tristate 'raw table support (required for TRACE)' depends on IP6_NF_IPTABLES @@ -278,19 +270,5 @@ config IP6_NF_RAW If you want to compile it as a module, say M here and read <file:Documentation/modules.txt>. If unsure, say `N'. -config NF_CONNTRACK_IPV6 - tristate "IPv6 support for new connection tracking (EXPERIMENTAL)" - depends on EXPERIMENTAL && NF_CONNTRACK - ---help--- - Connection tracking keeps a record of what packets have passed - through your machine, in order to figure out how they are related - into connections. - - This is IPv6 support on Layer 3 independent connection tracking. - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. - - To compile it as a module, choose M here. If unsure, say N. - endmenu diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7d492226c16..95d469271c4 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1972,7 +1972,7 @@ static int ip6t_get_matches(char *buffer, char **start, off_t offset, int length return pos; } -static struct { char *name; get_info_t *get_info; } ip6t_proc_entry[] = +static const struct { char *name; get_info_t *get_info; } ip6t_proc_entry[] = { { "ip6_tables_names", ip6t_get_tables }, { "ip6_tables_targets", ip6t_get_targets }, { "ip6_tables_matches", ip6t_get_matches }, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index e2c90b3a807..753a3ae8502 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -339,8 +339,8 @@ extern unsigned long nf_ct_icmpv6_timeout; /* From nf_conntrack_frag6.c */ extern unsigned long nf_ct_frag6_timeout; -extern unsigned long nf_ct_frag6_low_thresh; -extern unsigned long nf_ct_frag6_high_thresh; +extern unsigned int nf_ct_frag6_low_thresh; +extern unsigned int nf_ct_frag6_high_thresh; static struct ctl_table_header *nf_ct_ipv6_sysctl_header; @@ -367,7 +367,7 @@ static ctl_table nf_ct_sysctl_table[] = { .data = &nf_ct_frag6_low_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = &proc_dointvec, }, { .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, @@ -375,7 +375,7 @@ static ctl_table nf_ct_sysctl_table[] = { .data = &nf_ct_frag6_high_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = &proc_dointvec, }, { .ctl_name = 0 } }; diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index c0f1da5497a..a7e03cfacd0 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -68,8 +68,8 @@ static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 }; - __u8 type = orig->dst.u.icmp.type - 128; - if (type >= sizeof(invmap) || !invmap[type]) + int type = orig->dst.u.icmp.type - 128; + if (type < 0 || type >= sizeof(invmap) || !invmap[type]) return 0; tuple->src.u.icmp.id = orig->src.u.icmp.id; @@ -129,12 +129,12 @@ static int icmpv6_new(struct nf_conn *conntrack, [ICMPV6_ECHO_REQUEST - 128] = 1, [ICMPV6_NI_QUERY - 128] = 1 }; + int type = conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128; - if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new) - || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) { + if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { /* Can't create a new ICMPv6 `conn' with this. */ - DEBUGP("icmp: can't create new conn with type %u\n", - conntrack->tuplehash[0].tuple.dst.u.icmp.type); + DEBUGP("icmpv6: can't create new conn with type %u\n", + type + 128); NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); return 0; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 7640b9bb769..c2c52af9e56 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -55,9 +55,9 @@ #define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */ #define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT -int nf_ct_frag6_high_thresh = 256*1024; -int nf_ct_frag6_low_thresh = 192*1024; -int nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT; +unsigned int nf_ct_frag6_high_thresh = 256*1024; +unsigned int nf_ct_frag6_low_thresh = 192*1024; +unsigned long nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT; struct nf_ct_frag6_skb_cb { @@ -190,8 +190,10 @@ static void nf_ct_frag6_secret_rebuild(unsigned long dummy) atomic_t nf_ct_frag6_mem = ATOMIC_INIT(0); /* Memory Tracking Functions. */ -static inline void frag_kfree_skb(struct sk_buff *skb) +static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work) { + if (work) + *work -= skb->truesize; atomic_sub(skb->truesize, &nf_ct_frag6_mem); if (NFCT_FRAG6_CB(skb)->orig) kfree_skb(NFCT_FRAG6_CB(skb)->orig); @@ -199,8 +201,11 @@ static inline void frag_kfree_skb(struct sk_buff *skb) kfree_skb(skb); } -static inline void frag_free_queue(struct nf_ct_frag6_queue *fq) +static inline void frag_free_queue(struct nf_ct_frag6_queue *fq, + unsigned int *work) { + if (work) + *work -= sizeof(struct nf_ct_frag6_queue); atomic_sub(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem); kfree(fq); } @@ -218,7 +223,8 @@ static inline struct nf_ct_frag6_queue *frag_alloc_queue(void) /* Destruction primitives. */ /* Complete destruction of fq. */ -static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq) +static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq, + unsigned int *work) { struct sk_buff *fp; @@ -230,17 +236,17 @@ static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq) while (fp) { struct sk_buff *xp = fp->next; - frag_kfree_skb(fp); + frag_kfree_skb(fp, work); fp = xp; } - frag_free_queue(fq); + frag_free_queue(fq, work); } -static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) +static __inline__ void fq_put(struct nf_ct_frag6_queue *fq, unsigned int *work) { if (atomic_dec_and_test(&fq->refcnt)) - nf_ct_frag6_destroy(fq); + nf_ct_frag6_destroy(fq, work); } /* Kill fq entry. It is not destroyed immediately, @@ -262,16 +268,21 @@ static void nf_ct_frag6_evictor(void) { struct nf_ct_frag6_queue *fq; struct list_head *tmp; + unsigned int work; - for (;;) { - if (atomic_read(&nf_ct_frag6_mem) <= nf_ct_frag6_low_thresh) - return; + work = atomic_read(&nf_ct_frag6_mem); + if (work <= nf_ct_frag6_low_thresh) + return; + + work -= nf_ct_frag6_low_thresh; + while (work > 0) { read_lock(&nf_ct_frag6_lock); if (list_empty(&nf_ct_frag6_lru_list)) { read_unlock(&nf_ct_frag6_lock); return; } tmp = nf_ct_frag6_lru_list.next; + BUG_ON(tmp == NULL); fq = list_entry(tmp, struct nf_ct_frag6_queue, lru_list); atomic_inc(&fq->refcnt); read_unlock(&nf_ct_frag6_lock); @@ -281,7 +292,7 @@ static void nf_ct_frag6_evictor(void) fq_kill(fq); spin_unlock(&fq->lock); - fq_put(fq); + fq_put(fq, &work); } } @@ -298,7 +309,7 @@ static void nf_ct_frag6_expire(unsigned long data) out: spin_unlock(&fq->lock); - fq_put(fq); + fq_put(fq, NULL); } /* Creation primitives. */ @@ -318,7 +329,7 @@ static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash, atomic_inc(&fq->refcnt); write_unlock(&nf_ct_frag6_lock); fq_in->last_in |= COMPLETE; - fq_put(fq_in); + fq_put(fq_in, NULL); return fq; } } @@ -535,7 +546,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, fq->fragments = next; fq->meat -= free_it->len; - frag_kfree_skb(free_it); + frag_kfree_skb(free_it, NULL); } } @@ -811,7 +822,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { spin_unlock(&fq->lock); DEBUGP("Can't insert skb to queue\n"); - fq_put(fq); + fq_put(fq, NULL); goto ret_orig; } @@ -822,7 +833,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) } spin_unlock(&fq->lock); - fq_put(fq); + fq_put(fq, NULL); return ret_skb; ret_orig: @@ -881,5 +892,6 @@ int nf_ct_frag6_init(void) void nf_ct_frag6_cleanup(void) { del_timer(&nf_ct_frag6_secret_timer); + nf_ct_frag6_low_thresh = 0; nf_ct_frag6_evictor(); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 651c79b41ee..a66900cda2a 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -298,13 +298,10 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb, static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) { if ((raw6_sk(sk)->checksum || sk->sk_filter) && - skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - /* FIXME: increment a raw6 drops counter here */ - kfree_skb(skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_checksum_complete(skb)) { + /* FIXME: increment a raw6 drops counter here */ + kfree_skb(skb); + return 0; } /* Charge it to the socket. */ @@ -337,32 +334,25 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (!rp->checksum) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if (skb->ip_summed == CHECKSUM_HW) { - skb_postpull_rcsum(skb, skb->nh.raw, - skb->h.raw - skb->nh.raw); + if (skb->ip_summed == CHECKSUM_HW) { + skb_postpull_rcsum(skb, skb->nh.raw, + skb->h.raw - skb->nh.raw); + if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, - skb->len, inet->num, skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } - if (skb->ip_summed == CHECKSUM_NONE) - skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, - skb->len, inet->num, 0); } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, 0); if (inet->hdrincl) { - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + if (skb_checksum_complete(skb)) { /* FIXME: increment a raw6 drops counter here */ kfree_skb(skb); return 0; } - skb->ip_summed = CHECKSUM_UNNECESSARY; } rawv6_rcv_skb(sk, skb); @@ -407,7 +397,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, if (skb->ip_summed==CHECKSUM_UNNECESSARY) { err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); } else if (msg->msg_flags&MSG_TRUNC) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); } else { @@ -758,7 +748,9 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, } if (opt == NULL) opt = np->opt; - opt = fl6_merge_options(&opt_space, flowlabel, opt); + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + opt = ipv6_fixup_options(&opt_space, opt); fl.proto = proto; rawv6_probe_proto_opt(&fl, msg); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e4fe9ee484d..5d316cb72ec 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -74,7 +74,7 @@ struct ip6frag_skb_cb struct frag_queue { - struct frag_queue *next; + struct hlist_node list; struct list_head lru_list; /* lru list member */ __u32 id; /* fragment id */ @@ -95,14 +95,13 @@ struct frag_queue #define FIRST_IN 2 #define LAST_IN 1 __u16 nhoffset; - struct frag_queue **pprev; }; /* Hash table. */ #define IP6Q_HASHSZ 64 -static struct frag_queue *ip6_frag_hash[IP6Q_HASHSZ]; +static struct hlist_head ip6_frag_hash[IP6Q_HASHSZ]; static DEFINE_RWLOCK(ip6_frag_lock); static u32 ip6_frag_hash_rnd; static LIST_HEAD(ip6_frag_lru_list); @@ -110,9 +109,7 @@ int ip6_frag_nqueues = 0; static __inline__ void __fq_unlink(struct frag_queue *fq) { - if(fq->next) - fq->next->pprev = fq->pprev; - *fq->pprev = fq->next; + hlist_del(&fq->list); list_del(&fq->lru_list); ip6_frag_nqueues--; } @@ -163,28 +160,21 @@ static void ip6_frag_secret_rebuild(unsigned long dummy) get_random_bytes(&ip6_frag_hash_rnd, sizeof(u32)); for (i = 0; i < IP6Q_HASHSZ; i++) { struct frag_queue *q; + struct hlist_node *p, *n; - q = ip6_frag_hash[i]; - while (q) { - struct frag_queue *next = q->next; + hlist_for_each_entry_safe(q, p, n, &ip6_frag_hash[i], list) { unsigned int hval = ip6qhashfn(q->id, &q->saddr, &q->daddr); if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; + hlist_del(&q->list); /* Relink to new hash chain. */ - if ((q->next = ip6_frag_hash[hval]) != NULL) - q->next->pprev = &q->next; - ip6_frag_hash[hval] = q; - q->pprev = &ip6_frag_hash[hval]; - } + hlist_add_head(&q->list, + &ip6_frag_hash[hval]); - q = next; + } } } write_unlock(&ip6_frag_lock); @@ -337,10 +327,13 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash, struct frag_queue *fq_in) { struct frag_queue *fq; +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif write_lock(&ip6_frag_lock); #ifdef CONFIG_SMP - for (fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { if (fq->id == fq_in->id && ipv6_addr_equal(&fq_in->saddr, &fq->saddr) && ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) { @@ -358,10 +351,7 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash, atomic_inc(&fq->refcnt); atomic_inc(&fq->refcnt); - if((fq->next = ip6_frag_hash[hash]) != NULL) - fq->next->pprev = &fq->next; - ip6_frag_hash[hash] = fq; - fq->pprev = &ip6_frag_hash[hash]; + hlist_add_head(&fq->list, &ip6_frag_hash[hash]); INIT_LIST_HEAD(&fq->lru_list); list_add_tail(&fq->lru_list, &ip6_frag_lru_list); ip6_frag_nqueues++; @@ -401,10 +391,11 @@ static __inline__ struct frag_queue * fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) { struct frag_queue *fq; + struct hlist_node *n; unsigned int hash = ip6qhashfn(id, src, dst); read_lock(&ip6_frag_lock); - for(fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { if (fq->id == id && ipv6_addr_equal(src, &fq->saddr) && ipv6_addr_equal(dst, &fq->daddr)) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f7f42c3e96c..66140f13d11 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -413,11 +413,14 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, rt = ip6_rt_copy(ort); if (rt) { - ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); - - if (!(rt->rt6i_flags&RTF_GATEWAY)) + if (!(rt->rt6i_flags&RTF_GATEWAY)) { + if (rt->rt6i_dst.plen != 128 && + ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)) + rt->rt6i_flags |= RTF_ANYCAST; ipv6_addr_copy(&rt->rt6i_gateway, daddr); + } + ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; rt->u.dst.flags |= DST_HOST; @@ -829,7 +832,7 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, } rt->u.dst.obsolete = -1; - rt->rt6i_expires = clock_t_to_jiffies(rtmsg->rtmsg_info); + rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info); if (nlh && (r = NLMSG_DATA(nlh))) { rt->rt6i_protocol = r->rtm_protocol; } else { @@ -1413,7 +1416,9 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->u.dst.obsolete = -1; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; - if (!anycast) + if (anycast) + rt->rt6i_flags |= RTF_ANYCAST; + else rt->rt6i_flags |= RTF_LOCAL; rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); if (rt->rt6i_nexthop == NULL) { @@ -1701,10 +1706,8 @@ static void fib6_dump_end(struct netlink_callback *cb) fib6_walker_unlink(w); kfree(w); } - if (cb->args[1]) { - cb->done = (void*)cb->args[1]; - cb->args[1] = 0; - } + cb->done = (void*)cb->args[1]; + cb->args[1] = 0; } static int fib6_dump_done(struct netlink_callback *cb) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d746d3b27ef..8827389abaf 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -992,13 +992,12 @@ static void tcp_v6_send_reset(struct sk_buff *skb) /* sk = NULL, but it is safe for now. RST socket required. */ if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { - if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) + if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, buff, &fl, NULL, 0); + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); return; - - ip6_xmit(NULL, buff, &fl, NULL, 0); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); - return; + } } kfree_skb(buff); @@ -1057,11 +1056,11 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 fl.fl_ip_sport = t1->source; if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { - if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) + if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, buff, &fl, NULL, 0); + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); return; - ip6_xmit(NULL, buff, &fl, NULL, 0); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); - return; + } } kfree_skb(buff); @@ -1401,20 +1400,18 @@ out: static int tcp_v6_checksum_init(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,skb->csum)) + &skb->nh.ipv6h->daddr,skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; - LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n"); + } } + + skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, 0); + if (skb->len <= 76) { - if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0))) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { - skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,0); + return __skb_checksum_complete(skb); } return 0; } @@ -1575,7 +1572,7 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) goto discard_it; if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v6_checksum_init(skb) < 0)) + tcp_v6_checksum_init(skb))) goto bad_packet; th = skb->h.th; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bf9519341fd..5cc8731eb55 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -248,7 +248,7 @@ try_again: err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); } else if (msg->msg_flags&MSG_TRUNC) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); @@ -363,13 +363,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) return -1; } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - UDP6_INC_STATS_BH(UDP_MIB_INERRORS); - kfree_skb(skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb_checksum_complete(skb)) { + UDP6_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return 0; } if (sock_queue_rcv_skb(sk,skb)<0) { @@ -491,13 +488,10 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) uh = skb->h.uh; } - if (skb->ip_summed==CHECKSUM_HW) { + if (skb->ip_summed == CHECKSUM_HW && + !csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); @@ -521,8 +515,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (skb_checksum_complete(skb)) goto discard; UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); @@ -778,7 +771,9 @@ do_udp_sendmsg: } if (opt == NULL) opt = np->opt; - opt = fl6_merge_options(&opt_space, flowlabel, opt); + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + opt = ipv6_fixup_options(&opt_space, opt); fl->proto = IPPROTO_UDP; ipv6_addr_copy(&fl->fl6_dst, daddr); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index cf1d91e74c8..69bd957380e 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -214,6 +214,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) case IPPROTO_UDP: case IPPROTO_TCP: case IPPROTO_SCTP: + case IPPROTO_DCCP: if (pskb_may_pull(skb, skb->nh.raw + offset + 4 - skb->data)) { u16 *ports = (u16 *)exthdr; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 59d02cbbeb9..c3f0b078345 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -116,7 +116,9 @@ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) struct llc_sock* llc = llc_sk(sk); int rc = 0; - if (unlikely(llc_data_accept_state(llc->state) || llc->p_flag)) { + if (unlikely(llc_data_accept_state(llc->state) || + llc->remote_busy_flag || + llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); rc = llc_ui_wait_for_busy_core(sk, timeout); @@ -542,6 +544,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) if (sk_wait_event(sk, &timeout, (sk->sk_shutdown & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && + !llc->remote_busy_flag && !llc->p_flag))) break; rc = -ERESTARTSYS; diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index b0bcfb1f12d..8169f24ed33 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -866,7 +866,8 @@ int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb) llc->ack_must_be_send = 1; llc->ack_pf = pf_bit & 1; } - if (((llc->vR - llc->first_pdu_Ns + 129) % 128) >= llc->npta) { + if (((llc->vR - llc->first_pdu_Ns + 1 + LLC_2_SEQ_NBR_MODULO) + % LLC_2_SEQ_NBR_MODULO) >= llc->npta) { llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb); llc->ack_must_be_send = 0; llc->ack_pf = 0; @@ -994,8 +995,8 @@ static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb) llc->dec_step = 0; llc->dec_cntr = llc->inc_cntr = 2; ++llc->npta; - if (llc->npta > 127) - llc->npta = 127 ; + if (llc->npta > (u8) ~LLC_2_SEQ_NBR_MODULO) + llc->npta = (u8) ~LLC_2_SEQ_NBR_MODULO; } else --llc->inc_cntr; return 0; @@ -1065,9 +1066,10 @@ int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); u8 unacked_pdu = skb_queue_len(&llc->pdu_unack_q); - llc->k -= unacked_pdu; - if (llc->k < 2) - llc->k = 2; + if (llc->k - unacked_pdu < 1) + llc->k = 1; + else + llc->k -= unacked_pdu; return 0; } @@ -1084,8 +1086,8 @@ int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); llc->k += 1; - if (llc->k > 128) - llc->k = 128 ; + if (llc->k > (u8) ~LLC_2_SEQ_NBR_MODULO) + llc->k = (u8) ~LLC_2_SEQ_NBR_MODULO; return 0; } @@ -1309,7 +1311,7 @@ int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb) static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb) { - llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % 128; + llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % LLC_2_SEQ_NBR_MODULO; return 0; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index a84f9221e5f..794c41d19b2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -61,8 +61,8 @@ config NF_CONNTRACK_MARK instead of the individual packets. config NF_CONNTRACK_EVENTS - bool "Connection tracking events" - depends on NF_CONNTRACK + bool "Connection tracking events (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK help If this option is enabled, the connection tracking code will provide a notifier chain that can be used by other kernel code diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9a67c796b38..a7c7b490cf2 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -387,7 +387,7 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { ASSERT_WRITE_LOCK(&nf_conntrack_lock); - NF_CT_ASSERT(!timer_pending(&exp_timeout)); + NF_CT_ASSERT(!timer_pending(&exp->timeout)); list_del(&exp->list); NF_CT_STAT_INC(expect_delete); exp->master->expecting--; @@ -1383,6 +1383,9 @@ void nf_conntrack_cleanup(void) schedule(); goto i_see_dead_people; } + /* wait until all references to nf_conntrack_untracked are dropped */ + while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) + schedule(); for (i = 0; i < NF_CT_F_NUM; i++) { if (nf_ct_cache[i].use == 0) @@ -1395,6 +1398,13 @@ void nf_conntrack_cleanup(void) kmem_cache_destroy(nf_conntrack_expect_cachep); free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, nf_conntrack_htable_size); + + /* free l3proto protocol tables */ + for (i = 0; i < PF_MAX; i++) + if (nf_ct_protos[i]) { + kfree(nf_ct_protos[i]); + nf_ct_protos[i] = NULL; + } } static struct list_head *alloc_hashtable(int size, int *vmalloced) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 83d90dd624f..6035633d822 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -280,9 +280,9 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* - * sSS -> sIV Might be a half-open connection. + * sSS -> sIG Might be a half-open connection. * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -779,6 +779,7 @@ static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_ACK] = 1, + [TH_SYN|TH_PUSH] = 1, [TH_SYN|TH_ACK|TH_PUSH] = 1, [TH_RST] = 1, [TH_RST|TH_ACK] = 1, @@ -911,8 +912,12 @@ static int tcp_packet(struct nf_conn *conntrack, switch (new_state) { case TCP_CONNTRACK_IGNORE: - /* Either SYN in ORIGINAL - * or SYN/ACK in REPLY. */ + /* Ignored packets: + * + * a) SYN in ORIGINAL + * b) SYN/ACK in REPLY + * c) ACK in reply direction after initial SYN in original. + */ if (index == TCP_SYNACK_SET && conntrack->proto.tcp.last_index == TCP_SYN_SET && conntrack->proto.tcp.last_dir != dir @@ -969,16 +974,29 @@ static int tcp_packet(struct nf_conn *conntrack, conntrack->timeout.function((unsigned long) conntrack); return -NF_REPEAT; + } else { + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, + NULL, "nf_ct_tcp: invalid SYN"); + return -NF_ACCEPT; } case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET - && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_SYN_SET + && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_SYN_SET) + || (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { - /* RST sent to invalid SYN we had let trough - * SYN was in window then, tear down connection. + /* RST sent to invalid SYN or ACK we had let trough + * at a) and c) above: + * + * a) SYN was in window then + * c) we hold a half-open connection. + * + * Delete our connection entry. * We skip window checking, because packet might ACK - * segments we ignored in the SYN. */ + * segments we ignored. */ goto in_window; } /* Just fall trough */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 45224db4fe2..5af381f9fe3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -694,7 +694,7 @@ static int init_or_cleanup(int init) cleanup_proc_stat: #endif #ifdef CONFIG_PROC_FS - proc_net_remove("nf_conntrack_stat"); + remove_proc_entry("nf_conntrack", proc_net_stat); cleanup_proc_exp: proc_net_remove("nf_conntrack_expect"); cleanup_proc: diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 83f4c53030f..95fdf04f1d8 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -162,7 +162,7 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, return -EINVAL; } - min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg)); + min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); if (unlikely(nlh->nlmsg_len < min_len)) return -EINVAL; @@ -223,6 +223,12 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, NFNL_SUBSYS_ID(nlh->nlmsg_type), NFNL_MSG_TYPE(nlh->nlmsg_type)); + if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) { + DEBUGP("missing CAP_NET_ADMIN\n"); + *errp = -EPERM; + return -1; + } + /* Only requests are handled by kernel now. */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { DEBUGP("received non-request message\n"); @@ -230,8 +236,7 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, } /* All the messages must at least contain nfgenmsg */ - if (nlh->nlmsg_len < - NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) { + if (nlh->nlmsg_len < NLMSG_SPACE(sizeof(struct nfgenmsg))) { DEBUGP("received message was too short\n"); return 0; } @@ -240,15 +245,12 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, ss = nfnetlink_get_subsys(type); if (!ss) { #ifdef CONFIG_KMOD - if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) { - /* don't call nfnl_shunlock, since it would reenter - * with further packet processing */ - up(&nfnl_sem); - request_module("nfnetlink-subsys-%d", - NFNL_SUBSYS_ID(type)); - nfnl_shlock(); - ss = nfnetlink_get_subsys(type); - } + /* don't call nfnl_shunlock, since it would reenter + * with further packet processing */ + up(&nfnl_sem); + request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); + nfnl_shlock(); + ss = nfnetlink_get_subsys(type); if (!ss) #endif goto err_inval; @@ -260,13 +262,6 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, goto err_inval; } - if (nc->cap_required && - !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) { - DEBUGP("permission denied for type %d\n", type); - *errp = -EPERM; - return -1; - } - { u_int16_t attr_count = ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index d194676f365..cba63729313 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -862,11 +862,9 @@ out_put: static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, - .attr_count = NFULA_MAX, - .cap_required = CAP_NET_ADMIN, }, + .attr_count = NFULA_MAX, }, [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, - .attr_count = NFULA_CFG_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFULA_CFG_MAX, }, }; static struct nfnetlink_subsystem nfulnl_subsys = { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f065a6c9495..f28460b61e4 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -931,14 +931,11 @@ out_put: static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, - .attr_count = NFQA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_MAX, }, [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, - .attr_count = NFQA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_MAX, }, [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, - .attr_count = NFQA_CFG_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_CFG_MAX, }, }; static struct nfnetlink_subsystem nfqnl_subsys = { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 8c38ee6d255..96020d7087e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -476,7 +476,7 @@ static int netlink_autobind(struct socket *sock) struct hlist_head *head; struct sock *osk; struct hlist_node *node; - s32 pid = current->pid; + s32 pid = current->tgid; int err; static s32 rover = -4097; diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index 004e8599b8f..a7d88b5ad75 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -99,7 +99,7 @@ static int nr_state1_machine(struct sock *sk, struct sk_buff *skb, break; case NR_RESET: - if (sysctl_netrom_reset_circuit); + if (sysctl_netrom_reset_circuit) nr_disconnect(sk, ECONNRESET); break; @@ -130,7 +130,7 @@ static int nr_state2_machine(struct sock *sk, struct sk_buff *skb, break; case NR_RESET: - if (sysctl_netrom_reset_circuit); + if (sysctl_netrom_reset_circuit) nr_disconnect(sk, ECONNRESET); break; @@ -265,7 +265,7 @@ static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype break; case NR_RESET: - if (sysctl_netrom_reset_circuit); + if (sysctl_netrom_reset_circuit) nr_disconnect(sk, ECONNRESET); break; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 499ae3df4a4..3e246276041 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1587,23 +1587,47 @@ static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order) return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1); } -static void free_pg_vec(char **pg_vec, unsigned order, unsigned len) +static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len) { int i; - for (i=0; i<len; i++) { - if (pg_vec[i]) { - struct page *page, *pend; - - pend = pg_vec_endpage(pg_vec[i], order); - for (page = virt_to_page(pg_vec[i]); page <= pend; page++) - ClearPageReserved(page); - free_pages((unsigned long)pg_vec[i], order); - } + for (i = 0; i < len; i++) { + if (likely(pg_vec[i])) + free_pages((unsigned long) pg_vec[i], order); } kfree(pg_vec); } +static inline char *alloc_one_pg_vec_page(unsigned long order) +{ + return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO, + order); +} + +static char **alloc_pg_vec(struct tpacket_req *req, int order) +{ + unsigned int block_nr = req->tp_block_nr; + char **pg_vec; + int i; + + pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL); + if (unlikely(!pg_vec)) + goto out; + + for (i = 0; i < block_nr; i++) { + pg_vec[i] = alloc_one_pg_vec_page(order); + if (unlikely(!pg_vec[i])) + goto out_free_pgvec; + } + +out: + return pg_vec; + +out_free_pgvec: + free_pg_vec(pg_vec, order, block_nr); + pg_vec = NULL; + goto out; +} static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing) { @@ -1617,64 +1641,46 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing /* Sanity tests and some calculations */ - if (po->pg_vec) + if (unlikely(po->pg_vec)) return -EBUSY; - if ((int)req->tp_block_size <= 0) + if (unlikely((int)req->tp_block_size <= 0)) return -EINVAL; - if (req->tp_block_size&(PAGE_SIZE-1)) + if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) return -EINVAL; - if (req->tp_frame_size < TPACKET_HDRLEN) + if (unlikely(req->tp_frame_size < TPACKET_HDRLEN)) return -EINVAL; - if (req->tp_frame_size&(TPACKET_ALIGNMENT-1)) + if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) return -EINVAL; po->frames_per_block = req->tp_block_size/req->tp_frame_size; - if (po->frames_per_block <= 0) + if (unlikely(po->frames_per_block <= 0)) return -EINVAL; - if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr) + if (unlikely((po->frames_per_block * req->tp_block_nr) != + req->tp_frame_nr)) return -EINVAL; - /* OK! */ - - /* Allocate page vector */ - while ((PAGE_SIZE<<order) < req->tp_block_size) - order++; err = -ENOMEM; - - pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL); - if (pg_vec == NULL) + order = get_order(req->tp_block_size); + pg_vec = alloc_pg_vec(req, order); + if (unlikely(!pg_vec)) goto out; - memset(pg_vec, 0, req->tp_block_nr*sizeof(char **)); - - for (i=0; i<req->tp_block_nr; i++) { - struct page *page, *pend; - pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order); - if (!pg_vec[i]) - goto out_free_pgvec; - - pend = pg_vec_endpage(pg_vec[i], order); - for (page = virt_to_page(pg_vec[i]); page <= pend; page++) - SetPageReserved(page); - } - /* Page vector is allocated */ l = 0; - for (i=0; i<req->tp_block_nr; i++) { + for (i = 0; i < req->tp_block_nr; i++) { char *ptr = pg_vec[i]; struct tpacket_hdr *header; int k; - for (k=0; k<po->frames_per_block; k++) { - - header = (struct tpacket_hdr*)ptr; + for (k = 0; k < po->frames_per_block; k++) { + header = (struct tpacket_hdr *) ptr; header->tp_status = TP_STATUS_KERNEL; ptr += req->tp_frame_size; } } /* Done */ } else { - if (req->tp_frame_nr) + if (unlikely(req->tp_frame_nr)) return -EINVAL; } @@ -1701,7 +1707,7 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing spin_lock_bh(&sk->sk_receive_queue.lock); pg_vec = XC(po->pg_vec, pg_vec); - po->frame_max = req->tp_frame_nr-1; + po->frame_max = (req->tp_frame_nr - 1); po->head = 0; po->frame_size = req->tp_frame_size; spin_unlock_bh(&sk->sk_receive_queue.lock); @@ -1728,7 +1734,6 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing release_sock(sk); -out_free_pgvec: if (pg_vec) free_pg_vec(pg_vec, order, req->tp_block_nr); out: @@ -1755,17 +1760,19 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE) goto out; - atomic_inc(&po->mapped); start = vma->vm_start; - err = -EAGAIN; - for (i=0; i<po->pg_vec_len; i++) { - if (remap_pfn_range(vma, start, - __pa(po->pg_vec[i]) >> PAGE_SHIFT, - po->pg_vec_pages*PAGE_SIZE, - vma->vm_page_prot)) - goto out; - start += po->pg_vec_pages*PAGE_SIZE; + for (i = 0; i < po->pg_vec_len; i++) { + struct page *page = virt_to_page(po->pg_vec[i]); + int pg_num; + + for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) { + err = vm_insert_page(vma, start, page); + if (unlikely(err)) + goto out; + start += PAGE_SIZE; + } } + atomic_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 122c086ee2d..dbe6105e83a 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c @@ -23,6 +23,7 @@ #include <linux/in.h> #include <linux/in6.h> #include <linux/icmp.h> +#include <linux/skbuff.h> #include <net/sock.h> #include <net/ip.h> #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) @@ -475,15 +476,11 @@ void rxrpc_trans_receive_packet(struct rxrpc_transport *trans) /* we'll probably need to checksum it (didn't call * sock_recvmsg) */ - if (pkt->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short) - csum_fold(skb_checksum(pkt, 0, pkt->len, - pkt->csum))) { - kfree_skb(pkt); - rxrpc_krxiod_queue_transport(trans); - _leave(" CSUM failed"); - return; - } + if (skb_checksum_complete(pkt)) { + kfree_skb(pkt); + rxrpc_krxiod_queue_transport(trans); + _leave(" CSUM failed"); + return; } addr = pkt->nh.iph->saddr; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 7f34e7fd767..55cd5327fbd 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -40,9 +40,10 @@ config NET_SCHED The available schedulers are listed in the following questions; you can say Y to as many as you like. If unsure, say N now. +if NET_SCHED + choice prompt "Packet scheduler clock source" - depends on NET_SCHED default NET_SCH_CLK_JIFFIES ---help--- Packet schedulers need a monotonic clock that increments at a static @@ -98,11 +99,9 @@ config NET_SCH_CLK_CPU endchoice comment "Queueing/Scheduling" - depends on NET_SCHED config NET_SCH_CBQ tristate "Class Based Queueing (CBQ)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Class-Based Queueing (CBQ) packet scheduling algorithm. This algorithm classifies the waiting packets @@ -120,7 +119,6 @@ config NET_SCH_CBQ config NET_SCH_HTB tristate "Hierarchical Token Bucket (HTB)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Hierarchical Token Buckets (HTB) packet scheduling algorithm. See @@ -135,7 +133,6 @@ config NET_SCH_HTB config NET_SCH_HFSC tristate "Hierarchical Fair Service Curve (HFSC)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Hierarchical Fair Service Curve (HFSC) packet scheduling algorithm. @@ -145,7 +142,7 @@ config NET_SCH_HFSC config NET_SCH_ATM tristate "ATM Virtual Circuits (ATM)" - depends on NET_SCHED && ATM + depends on ATM ---help--- Say Y here if you want to use the ATM pseudo-scheduler. This provides a framework for invoking classifiers, which in turn @@ -159,7 +156,6 @@ config NET_SCH_ATM config NET_SCH_PRIO tristate "Multi Band Priority Queueing (PRIO)" - depends on NET_SCHED ---help--- Say Y here if you want to use an n-band priority queue packet scheduler. @@ -169,7 +165,6 @@ config NET_SCH_PRIO config NET_SCH_RED tristate "Random Early Detection (RED)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Random Early Detection (RED) packet scheduling algorithm. @@ -181,7 +176,6 @@ config NET_SCH_RED config NET_SCH_SFQ tristate "Stochastic Fairness Queueing (SFQ)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) packet scheduling algorithm . @@ -193,7 +187,6 @@ config NET_SCH_SFQ config NET_SCH_TEQL tristate "True Link Equalizer (TEQL)" - depends on NET_SCHED ---help--- Say Y here if you want to use the True Link Equalizer (TLE) packet scheduling algorithm. This queueing discipline allows the combination @@ -206,7 +199,6 @@ config NET_SCH_TEQL config NET_SCH_TBF tristate "Token Bucket Filter (TBF)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Token Bucket Filter (TBF) packet scheduling algorithm. @@ -218,7 +210,6 @@ config NET_SCH_TBF config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Generic Random Early Detection (GRED) packet scheduling algorithm for some of your network devices @@ -230,7 +221,6 @@ config NET_SCH_GRED config NET_SCH_DSMARK tristate "Differentiated Services marker (DSMARK)" - depends on NET_SCHED ---help--- Say Y if you want to schedule packets according to the Differentiated Services architecture proposed in RFC 2475. @@ -242,7 +232,6 @@ config NET_SCH_DSMARK config NET_SCH_NETEM tristate "Network emulator (NETEM)" - depends on NET_SCHED ---help--- Say Y if you want to emulate network delay, loss, and packet re-ordering. This is often useful to simulate networks when @@ -255,7 +244,6 @@ config NET_SCH_NETEM config NET_SCH_INGRESS tristate "Ingress Qdisc" - depends on NET_SCHED ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. @@ -264,14 +252,12 @@ config NET_SCH_INGRESS module will be called sch_ingress. comment "Classification" - depends on NET_SCHED config NET_CLS boolean config NET_CLS_BASIC tristate "Elementary classification (BASIC)" - depends NET_SCHED select NET_CLS ---help--- Say Y here if you want to be able to classify packets using @@ -282,7 +268,6 @@ config NET_CLS_BASIC config NET_CLS_TCINDEX tristate "Traffic-Control Index (TCINDEX)" - depends NET_SCHED select NET_CLS ---help--- Say Y here if you want to be able to classify packets based on @@ -294,7 +279,6 @@ config NET_CLS_TCINDEX config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" - depends NET_SCHED select NET_CLS_ROUTE select NET_CLS ---help--- @@ -306,11 +290,9 @@ config NET_CLS_ROUTE4 config NET_CLS_ROUTE bool - default n config NET_CLS_FW tristate "Netfilter mark (FW)" - depends NET_SCHED select NET_CLS ---help--- If you say Y here, you will be able to classify packets @@ -321,7 +303,6 @@ config NET_CLS_FW config NET_CLS_U32 tristate "Universal 32bit comparisons w/ hashing (U32)" - depends NET_SCHED select NET_CLS ---help--- Say Y here to be able to classify packetes using a universal @@ -345,7 +326,6 @@ config CLS_U32_MARK config NET_CLS_RSVP tristate "IPv4 Resource Reservation Protocol (RSVP)" - depends on NET_SCHED select NET_CLS select NET_ESTIMATOR ---help--- @@ -361,7 +341,6 @@ config NET_CLS_RSVP config NET_CLS_RSVP6 tristate "IPv6 Resource Reservation Protocol (RSVP6)" - depends on NET_SCHED select NET_CLS select NET_ESTIMATOR ---help--- @@ -377,7 +356,6 @@ config NET_CLS_RSVP6 config NET_EMATCH bool "Extended Matches" - depends NET_SCHED select NET_CLS ---help--- Say Y here if you want to use extended matches on top of classifiers @@ -456,7 +434,7 @@ config NET_EMATCH_TEXT config NET_CLS_ACT bool "Actions" - depends on EXPERIMENTAL && NET_SCHED + depends on EXPERIMENTAL select NET_ESTIMATOR ---help--- Say Y here if you want to use traffic control actions. Actions @@ -539,7 +517,7 @@ config NET_ACT_SIMP config NET_CLS_POLICE bool "Traffic Policing (obsolete)" - depends on NET_SCHED && NET_CLS_ACT!=y + depends on NET_CLS_ACT!=y select NET_ESTIMATOR ---help--- Say Y here if you want to do traffic policing, i.e. strict @@ -549,7 +527,7 @@ config NET_CLS_POLICE config NET_CLS_IND bool "Incoming device classification" - depends on NET_SCHED && (NET_CLS_U32 || NET_CLS_FW) + depends on NET_CLS_U32 || NET_CLS_FW ---help--- Say Y here to extend the u32 and fw classifier to support classification based on the incoming device. This option is @@ -557,11 +535,12 @@ config NET_CLS_IND config NET_ESTIMATOR bool "Rate estimator" - depends on NET_SCHED ---help--- Say Y here to allow using rate estimators to estimate the current rate-of-flow for network devices, queues, etc. This module is automaticaly selected if needed but can be selected manually for statstical purposes. +endif # NET_SCHED + endmenu diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 8aebe8f6d27..2ce1cb2aa2e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -34,7 +34,7 @@ #include <net/sch_generic.h> #include <net/act_api.h> -#if 1 /* control */ +#if 0 /* control */ #define DPRINTK(format, args...) printk(KERN_DEBUG format, ##args) #else #define DPRINTK(format, args...) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index cdc8d283791..82fb07aa06a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -464,7 +464,7 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) const struct netem_skb_cb *cb = (const struct netem_skb_cb *)skb->cb; - if (PSCHED_TLESS(cb->time_to_send, ncb->time_to_send)) + if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send)) break; } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 8c8ddf7f9b6..dec68a60477 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -128,9 +128,29 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a */ asoc->max_burst = sctp_max_burst; - /* Copy things from the endpoint. */ + /* initialize association timers */ + asoc->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; + + /* sctpimpguide Section 2.12.2 + * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the + * recommended value of 5 times 'RTO.Max'. + */ + asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] + = 5 * asoc->rto_max; + + asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + SCTP_DEFAULT_TIMEOUT_SACK; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = + sp->autoclose * HZ; + + /* Initilizes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { - asoc->timeouts[i] = ep->timeouts[i]; init_timer(&asoc->timers[i]); asoc->timers[i].function = sctp_timer_events[i]; asoc->timers[i].data = (unsigned long) asoc; @@ -157,10 +177,10 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a * RFC 6 - A SCTP receiver MUST be able to receive a minimum of * 1500 bytes in one SCTP packet. */ - if (sk->sk_rcvbuf < SCTP_DEFAULT_MINWINDOW) + if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW) asoc->rwnd = SCTP_DEFAULT_MINWINDOW; else - asoc->rwnd = sk->sk_rcvbuf; + asoc->rwnd = sk->sk_rcvbuf/2; asoc->a_rwnd = asoc->rwnd; @@ -172,6 +192,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a /* Set the sndbuf size for transmit. */ asoc->sndbuf_used = 0; + /* Initialize the receive memory counter */ + atomic_set(&asoc->rmem_alloc, 0); + init_waitqueue_head(&asoc->wait); asoc->c.my_vtag = sctp_generate_tag(ep); @@ -380,6 +403,8 @@ static void sctp_association_destroy(struct sctp_association *asoc) spin_unlock_bh(&sctp_assocs_id_lock); } + BUG_TRAP(!atomic_read(&asoc->rmem_alloc)); + if (asoc->base.malloced) { kfree(asoc); SCTP_DBG_OBJCNT_DEC(assoc); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 96984f7a2d6..67bd53070ee 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -70,7 +70,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct sock *sk, gfp_t gfp) { - struct sctp_sock *sp = sctp_sk(sk); memset(ep, 0, sizeof(struct sctp_endpoint)); /* Initialize the base structure. */ @@ -100,33 +99,14 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Create the lists of associations. */ INIT_LIST_HEAD(&ep->asocs); - /* Set up the base timeout information. */ - ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; - - /* sctpimpguide-05 Section 2.12.2 - * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the - * recommended value of 5 times 'RTO.Max'. - */ - ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] - = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); - - ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout; - ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; - /* Use SCTP specific send buffer space queues. */ ep->sndbuf_policy = sctp_sndbuf_policy; sk->sk_write_space = sctp_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + /* Get the receive buffer policy for this endpoint */ + ep->rcvbuf_policy = sctp_rcvbuf_policy; + /* Initialize the secret key used with cookie. */ get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE); ep->last_key = ep->current_key = 0; diff --git a/net/sctp/input.c b/net/sctp/input.c index 28f32243397..b24ff2c1aef 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -100,21 +100,6 @@ static inline int sctp_rcv_checksum(struct sk_buff *skb) return 0; } -/* The free routine for skbuffs that sctp receives */ -static void sctp_rfree(struct sk_buff *skb) -{ - atomic_sub(sizeof(struct sctp_chunk),&skb->sk->sk_rmem_alloc); - sock_rfree(skb); -} - -/* The ownership wrapper routine to do receive buffer accounting */ -static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk) -{ - skb_set_owner_r(skb,sk); - skb->destructor = sctp_rfree; - atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); -} - struct sctp_input_cb { union { struct inet_skb_parm h4; @@ -217,9 +202,6 @@ int sctp_rcv(struct sk_buff *skb) rcvr = &ep->base; } - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) - goto discard_release; - /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * An SCTP packet is called an "out of the blue" (OOTB) @@ -256,8 +238,6 @@ int sctp_rcv(struct sk_buff *skb) } SCTP_INPUT_CB(skb)->chunk = chunk; - sctp_rcv_set_owner_r(skb,sk); - /* Remember what endpoint is to handle this packet. */ chunk->rcvr = rcvr; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 26de4d3e1bd..f775d78aa59 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -530,6 +530,9 @@ static void sctp_v4_get_saddr(struct sctp_association *asoc, { struct rtable *rt = (struct rtable *)dst; + if (!asoc) + return; + if (rt) { saddr->v4.sin_family = AF_INET; saddr->v4.sin_port = asoc->base.bind_addr.port; @@ -1047,6 +1050,9 @@ SCTP_STATIC __init int sctp_init(void) /* Sendbuffer growth - do per-socket accounting */ sctp_sndbuf_policy = 0; + /* Rcvbuffer growth - do per-socket accounting */ + sctp_rcvbuf_policy = 0; + /* HB.interval - 30 seconds */ sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index f84173ea8ec..823947170a3 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -385,7 +385,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { NULL, sctp_generate_t4_rto_event, sctp_generate_t5_shutdown_guard_event, - sctp_generate_heartbeat_event, + NULL, sctp_generate_sack_event, sctp_generate_autoclose_event, }; @@ -689,9 +689,9 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, * increased due to timer expirations. */ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = - asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]; + asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = - asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]; + asoc->rto_initial; } if (sctp_state(asoc, ESTABLISHED) || diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 505c7de10c5..475bfb4972d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -5160,6 +5160,8 @@ static int sctp_eat_data(const struct sctp_association *asoc, sctp_verb_t deliver; int tmp; __u32 tsn; + int account_value; + struct sock *sk = asoc->base.sk; data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data; skb_pull(chunk->skb, sizeof(sctp_datahdr_t)); @@ -5169,6 +5171,26 @@ static int sctp_eat_data(const struct sctp_association *asoc, /* ASSERT: Now skb->data is really the user data. */ + /* + * if we are established, and we have used up our receive + * buffer memory, drop the frame + */ + if (asoc->state == SCTP_STATE_ESTABLISHED) { + /* + * If the receive buffer policy is 1, then each + * association can allocate up to sk_rcvbuf bytes + * otherwise, all the associations in aggregate + * may allocate up to sk_rcvbuf bytes + */ + if (asoc->ep->rcvbuf_policy) + account_value = atomic_read(&asoc->rmem_alloc); + else + account_value = atomic_read(&sk->sk_rmem_alloc); + + if (account_value > sk->sk_rcvbuf) + return SCTP_IERROR_IGNORE_TSN; + } + /* Process ECN based congestion. * * Since the chunk structure is reused for all chunks within diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b529af5e6f2..9df888e932c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -156,10 +156,6 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk) sizeof(struct sk_buff) + sizeof(struct sctp_chunk); - sk->sk_wmem_queued += SCTP_DATA_SNDSIZE(chunk) + - sizeof(struct sk_buff) + - sizeof(struct sctp_chunk); - atomic_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); } @@ -1932,7 +1928,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, if (copy_from_user(&sp->autoclose, optval, optlen)) return -EFAULT; - sp->ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; return 0; } @@ -3426,7 +3421,7 @@ static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_add } static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, - void * __user *to, size_t space_left) + void __user **to, size_t space_left) { struct list_head *pos; struct sctp_sockaddr_entry *addr; @@ -4427,7 +4422,7 @@ cleanup: * tcp_poll(). Note that, based on these implementations, we don't * lock the socket in this function, even though it seems that, * ideally, locking or some other mechanisms can be used to ensure - * the integrity of the counters (sndbuf and wmem_queued) used + * the integrity of the counters (sndbuf and wmem_alloc) used * in this place. We assume that we don't need locks either until proven * otherwise. * @@ -4744,11 +4739,6 @@ static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, struct sk_buff *skb; long timeo; - /* Caller is allowed not to check sk->sk_err before calling. */ - error = sock_error(sk); - if (error) - goto no_packet; - timeo = sock_rcvtimeo(sk, noblock); SCTP_DEBUG_PRINTK("Timeout: timeo: %ld, MAX: %ld.\n", @@ -4775,6 +4765,11 @@ static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, if (skb) return skb; + /* Caller is allowed not to check sk->sk_err before calling. */ + error = sock_error(sk); + if (error) + goto no_packet; + if (sk->sk_shutdown & RCV_SHUTDOWN) break; @@ -4834,10 +4829,6 @@ static void sctp_wfree(struct sk_buff *skb) sizeof(struct sk_buff) + sizeof(struct sctp_chunk); - sk->sk_wmem_queued -= SCTP_DATA_SNDSIZE(chunk) + - sizeof(struct sk_buff) + - sizeof(struct sctp_chunk); - atomic_sub(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); sock_wfree(skb); @@ -4921,7 +4912,7 @@ void sctp_write_space(struct sock *sk) /* Is there any sndbuf space available on the socket? * - * Note that wmem_queued is the sum of the send buffers on all of the + * Note that sk_wmem_alloc is the sum of the send buffers on all of the * associations on the same socket. For a UDP-style socket with * multiple associations, it is possible for it to be "unwriteable" * prematurely. I assume that this is acceptable because @@ -4934,7 +4925,7 @@ static int sctp_writeable(struct sock *sk) { int amt = 0; - amt = sk->sk_sndbuf - sk->sk_wmem_queued; + amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); if (amt < 0) amt = 0; return amt; @@ -5115,8 +5106,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { + sock_rfree(skb); __skb_unlink(skb, &oldsk->sk_receive_queue); __skb_queue_tail(&newsk->sk_receive_queue, skb); + skb_set_owner_r(skb, newsk); } } @@ -5144,8 +5137,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { + sock_rfree(skb); __skb_unlink(skb, &oldsp->pd_lobby); __skb_queue_tail(queue, skb); + skb_set_owner_r(skb, newsk); } } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 75b28dd634f..fcd7096c953 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -121,6 +121,14 @@ static ctl_table sctp_table[] = { .proc_handler = &proc_dointvec }, { + .ctl_name = NET_SCTP_RCVBUF_POLICY, + .procname = "rcvbuf_policy", + .data = &sctp_rcvbuf_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = NET_SCTP_PATH_MAX_RETRANS, .procname = "path_max_retrans", .data = &sctp_max_retrans_path, diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 6bc27200e6c..268ddaf2dc0 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -261,7 +261,8 @@ void sctp_transport_route(struct sctp_transport *transport, * association's active path for getsockname(). */ if (asoc && (transport == asoc->peer.active_path)) - af->to_sk_saddr(&transport->saddr, asoc->base.sk); + opt->pf->af->to_sk_saddr(&transport->saddr, + asoc->base.sk); } else transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; } diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index e049f41faa4..ba97f974f57 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -52,19 +52,6 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, struct sctp_association *asoc); static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); -/* Stub skb destructor. */ -static void sctp_stub_rfree(struct sk_buff *skb) -{ -/* WARNING: This function is just a warning not to use the - * skb destructor. If the skb is shared, we may get the destructor - * callback on some processor that does not own the sock_lock. This - * was occuring with PACKET socket applications that were monitoring - * our skbs. We can't take the sock_lock, because we can't risk - * recursing if we do really own the sock lock. Instead, do all - * of our rwnd manipulation while we own the sock_lock outright. - */ -} - /* Initialize an ULP event from an given skb. */ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) { @@ -111,15 +98,19 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, */ sctp_association_hold((struct sctp_association *)asoc); skb = sctp_event2skb(event); - skb->sk = asoc->base.sk; event->asoc = (struct sctp_association *)asoc; - skb->destructor = sctp_stub_rfree; + atomic_add(skb->truesize, &event->asoc->rmem_alloc); + skb_set_owner_r(skb, asoc->base.sk); } /* A simple destructor to give up the reference to the association. */ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) { - sctp_association_put(event->asoc); + struct sctp_association *asoc = event->asoc; + struct sk_buff *skb = sctp_event2skb(event); + + atomic_sub(skb->truesize, &asoc->rmem_alloc); + sctp_association_put(asoc); } /* Create and initialize an SCTP_ASSOC_CHANGE event. @@ -922,7 +913,6 @@ done: /* Free a ulpevent that has an owner. It includes releasing the reference * to the owner, updating the rwnd in case of a DATA event and freeing the * skb. - * See comments in sctp_stub_rfree(). */ void sctp_ulpevent_free(struct sctp_ulpevent *event) { diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index f44f46f1d8e..8d782282ec1 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -638,7 +638,7 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) gss_msg); atomic_inc(&gss_msg->count); gss_unhash_msg(gss_msg); - if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) { + if (msg->errno == -ETIMEDOUT) { unsigned long now = jiffies; if (time_after(now, ratelimit)) { printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n" @@ -786,7 +786,9 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int taskflags) cred->gc_flags = 0; cred->gc_base.cr_ops = &gss_credops; cred->gc_service = gss_auth->service; - err = gss_create_upcall(gss_auth, cred); + do { + err = gss_create_upcall(gss_auth, cred); + } while (err == -EAGAIN); if (err < 0) goto out_err; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 81e00a6c19d..16a2458f38f 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -39,23 +39,26 @@ static kmem_cache_t *rpc_inode_cachep __read_mostly; #define RPC_UPCALL_TIMEOUT (30*HZ) static void -__rpc_purge_upcall(struct inode *inode, int err) +__rpc_purge_list(struct rpc_inode *rpci, struct list_head *head, int err) { - struct rpc_inode *rpci = RPC_I(inode); struct rpc_pipe_msg *msg; + void (*destroy_msg)(struct rpc_pipe_msg *); - while (!list_empty(&rpci->pipe)) { - msg = list_entry(rpci->pipe.next, struct rpc_pipe_msg, list); - list_del_init(&msg->list); - msg->errno = err; - rpci->ops->destroy_msg(msg); - } - while (!list_empty(&rpci->in_upcall)) { - msg = list_entry(rpci->pipe.next, struct rpc_pipe_msg, list); + destroy_msg = rpci->ops->destroy_msg; + while (!list_empty(head)) { + msg = list_entry(head->next, struct rpc_pipe_msg, list); list_del_init(&msg->list); msg->errno = err; - rpci->ops->destroy_msg(msg); + destroy_msg(msg); } +} + +static void +__rpc_purge_upcall(struct inode *inode, int err) +{ + struct rpc_inode *rpci = RPC_I(inode); + + __rpc_purge_list(rpci, &rpci->pipe, err); rpci->pipelen = 0; wake_up(&rpci->waitq); } @@ -115,6 +118,7 @@ rpc_close_pipes(struct inode *inode) down(&inode->i_sem); if (rpci->ops != NULL) { rpci->nreaders = 0; + __rpc_purge_list(rpci, &rpci->in_upcall, -EPIPE); __rpc_purge_upcall(inode, -EPIPE); rpci->nwriters = 0; if (rpci->ops->release_pipe) @@ -170,7 +174,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) goto out; msg = (struct rpc_pipe_msg *)filp->private_data; if (msg != NULL) { - msg->errno = -EPIPE; + msg->errno = -EAGAIN; list_del_init(&msg->list); rpci->ops->destroy_msg(msg); } @@ -179,7 +183,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) if (filp->f_mode & FMODE_READ) rpci->nreaders --; if (!rpci->nreaders) - __rpc_purge_upcall(inode, -EPIPE); + __rpc_purge_upcall(inode, -EAGAIN); if (rpci->ops->release_pipe) rpci->ops->release_pipe(inode); out: diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 8f97e90f36c..eb330d4f66d 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -6,6 +6,9 @@ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ +#include <linux/compiler.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> #include <linux/types.h> #include <linux/pagemap.h> #include <linux/udp.h> @@ -165,6 +168,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) return -1; if ((unsigned short)csum_fold(desc.csum)) return -1; + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); return 0; no_checksum: if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f16e7cdd615..c6a51911e71 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -623,12 +623,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) /* we can use it in-place */ rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); rqstp->rq_arg.head[0].iov_len = len; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - skb_free_datagram(svsk->sk_sk, skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb_checksum_complete(skb)) { + skb_free_datagram(svsk->sk_sk, skb); + return 0; } rqstp->rq_skbuff = skb; } @@ -1181,6 +1178,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) arg->tail[0].iov_len = 0; try_to_freeze(); + cond_resched(); if (signalled()) return -EINTR; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0a51fd46a84..77e8800d412 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -990,6 +990,7 @@ static void xs_udp_connect_worker(void *args) sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_no_check = UDP_CSUM_NORCV; + sk->sk_allocation = GFP_ATOMIC; xprt_set_connected(xprt); @@ -1074,6 +1075,7 @@ static void xs_tcp_connect_worker(void *args) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; + sk->sk_allocation = GFP_ATOMIC; /* socket options */ sk->sk_userlocks |= SOCK_BINDPORT_LOCK; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0db9e57013f..d19e274b9c4 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -346,6 +346,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct xfrm_policy *pol, **p; struct xfrm_policy *delpol = NULL; struct xfrm_policy **newpos = NULL; + struct dst_entry *gc_list; write_lock_bh(&xfrm_policy_lock); for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) { @@ -381,9 +382,36 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_pol_hold(policy); write_unlock_bh(&xfrm_policy_lock); - if (delpol) { + if (delpol) xfrm_policy_kill(delpol); + + read_lock_bh(&xfrm_policy_lock); + gc_list = NULL; + for (policy = policy->next; policy; policy = policy->next) { + struct dst_entry *dst; + + write_lock(&policy->lock); + dst = policy->bundles; + if (dst) { + struct dst_entry *tail = dst; + while (tail->next) + tail = tail->next; + tail->next = gc_list; + gc_list = dst; + + policy->bundles = NULL; + } + write_unlock(&policy->lock); + } + read_unlock_bh(&xfrm_policy_lock); + + while (gc_list) { + struct dst_entry *dst = gc_list; + + gc_list = dst->next; + dst_free(dst); } + return 0; } EXPORT_SYMBOL(xfrm_policy_insert); @@ -1014,13 +1042,12 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) } EXPORT_SYMBOL(__xfrm_route_forward); -/* Optimize later using cookies and generation ids. */ - static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { - if (!stale_bundle(dst)) - return dst; - + /* If it is marked obsolete, which is how we even get here, + * then we have purged it from the policy bundle list and we + * did that for a good reason. + */ return NULL; } @@ -1104,6 +1131,16 @@ int xfrm_flush_bundles(void) return 0; } +static int always_true(struct dst_entry *dst) +{ + return 1; +} + +void xfrm_flush_all_bundles(void) +{ + xfrm_prune_bundles(always_true); +} + void xfrm_init_pmtu(struct dst_entry *dst) { do { diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 7cf48aa6c95..479effc9766 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -431,6 +431,8 @@ void xfrm_state_insert(struct xfrm_state *x) spin_lock_bh(&xfrm_state_lock); __xfrm_state_insert(x); spin_unlock_bh(&xfrm_state_lock); + + xfrm_flush_all_bundles(); } EXPORT_SYMBOL(xfrm_state_insert); @@ -478,6 +480,9 @@ out: spin_unlock_bh(&xfrm_state_lock); xfrm_state_put_afinfo(afinfo); + if (!err) + xfrm_flush_all_bundles(); + if (x1) { xfrm_state_delete(x1); xfrm_state_put(x1); |