diff options
Diffstat (limited to 'net')
41 files changed, 639 insertions, 326 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 91e412b0ab0..67465b65abe 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -753,6 +753,8 @@ static int vlan_ioctl_handler(void __user *arg) break; case GET_VLAN_REALDEV_NAME_CMD: err = vlan_dev_get_realdev_name(args.device1, args.u.device2); + if (err) + goto out; if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) { err = -EFAULT; @@ -761,6 +763,8 @@ static int vlan_ioctl_handler(void __user *arg) case GET_VLAN_VID_CMD: err = vlan_dev_get_vid(args.device1, &vid); + if (err) + goto out; args.u.VID = vid; if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) { @@ -774,7 +778,7 @@ static int vlan_ioctl_handler(void __user *arg) __FUNCTION__, args.cmd); return -EINVAL; }; - +out: return err; } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index b7486488967..f2a8750bbf1 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -165,6 +165,9 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, skb_pull(skb, VLAN_HLEN); /* take off the VLAN header (4 bytes currently) */ + /* Need to correct hardware checksum */ + skb_postpull_rcsum(skb, vhdr, VLAN_HLEN); + /* Ok, lets check to make sure the device (dev) we * came in on is what this VLAN is attached to. 
*/ diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index d8e36b77512..23422bd53a5 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -295,7 +295,7 @@ static int check_hbh_len(struct sk_buff *skb) len -= 2; while (len > 0) { - int optlen = raw[off+1]+2; + int optlen = skb->nh.raw[off+1]+2; switch (skb->nh.raw[off]) { case IPV6_TLV_PAD0: @@ -308,18 +308,15 @@ static int check_hbh_len(struct sk_buff *skb) case IPV6_TLV_JUMBO: if (skb->nh.raw[off+1] != 4 || (off&3) != 2) goto bad; - pkt_len = ntohl(*(u32*)(skb->nh.raw+off+2)); - + if (pkt_len <= IPV6_MAXPLEN || + skb->nh.ipv6h->payload_len) + goto bad; if (pkt_len > skb->len - sizeof(struct ipv6hdr)) goto bad; - if (pkt_len + sizeof(struct ipv6hdr) < skb->len) { - if (__pskb_trim(skb, - pkt_len + sizeof(struct ipv6hdr))) - goto bad; - if (skb->ip_summed == CHECKSUM_HW) - skb->ip_summed = CHECKSUM_NONE; - } + if (pskb_trim_rcsum(skb, + pkt_len+sizeof(struct ipv6hdr))) + goto bad; break; default: if (optlen > len) @@ -372,6 +369,7 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook, if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) goto inhdr_error; + nf_bridge_put(skb->nf_bridge); if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) return NF_DROP; setup_pre_routing(skb); @@ -455,6 +453,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, skb->ip_summed = CHECKSUM_NONE; } + nf_bridge_put(skb->nf_bridge); if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) return NF_DROP; setup_pre_routing(skb); diff --git a/net/core/dev.c b/net/core/dev.c index 0b48e294aaf..a5efc9ae010 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1113,7 +1113,8 @@ out: void netdev_rx_csum_fault(struct net_device *dev) { if (net_ratelimit()) { - printk(KERN_ERR "%s: hw csum failure.\n", dev->name); + printk(KERN_ERR "%s: hw csum failure.\n", + dev ? 
dev->name : "<unknown>"); dump_stack(); } } diff --git a/net/core/filter.c b/net/core/filter.c index 2841bfce29d..3a10e0bc90e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -293,7 +293,7 @@ int sk_chk_filter(struct sock_filter *filter, int flen) struct sock_filter *ftest; int pc; - if (((unsigned int)flen >= (~0U / sizeof(struct sock_filter))) || flen == 0) + if (flen == 0 || flen > BPF_MAXINSNS) return -EINVAL; /* check the filter code now */ @@ -360,7 +360,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) int err; /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL || fprog->len > BPF_MAXINSNS) + if (fprog->filter == NULL) return -EINVAL; fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b7d13a4fff4..83fee37de38 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1725,7 +1725,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, * of the skb if any page alloc fails user this procedure returns -ENOMEM */ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, - int getfrag(void *from, char *to, int offset, + int (*getfrag)(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length) { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ca03521112c..656e13e38cf 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1251,7 +1251,7 @@ static int dccp_v4_destroy_sock(struct sock *sk) struct dccp_sock *dp = dccp_sk(sk); /* - * DCCP doesn't use sk_qrite_queue, just sk_send_head + * DCCP doesn't use sk_write_queue, just sk_send_head * for retransmissions */ if (sk->sk_send_head != NULL) { diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index f89e55f814d..d402e9020c6 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -153,6 +153,7 @@ static struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head 
dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; +static atomic_t decnet_memory_allocated; static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen, int flags); static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); @@ -446,10 +447,26 @@ static void dn_destruct(struct sock *sk) dst_release(xchg(&sk->sk_dst_cache, NULL)); } +static int dn_memory_pressure; + +static void dn_enter_memory_pressure(void) +{ + if (!dn_memory_pressure) { + dn_memory_pressure = 1; + } +} + static struct proto dn_proto = { - .name = "DECNET", - .owner = THIS_MODULE, - .obj_size = sizeof(struct dn_sock), + .name = "NSP", + .owner = THIS_MODULE, + .enter_memory_pressure = dn_enter_memory_pressure, + .memory_pressure = &dn_memory_pressure, + .memory_allocated = &decnet_memory_allocated, + .sysctl_mem = sysctl_decnet_mem, + .sysctl_wmem = sysctl_decnet_wmem, + .sysctl_rmem = sysctl_decnet_rmem, + .max_header = DN_MAX_NSP_DATA_HEADER + 64, + .obj_size = sizeof(struct dn_sock), }; static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp) @@ -470,6 +487,8 @@ static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp) sk->sk_family = PF_DECnet; sk->sk_protocol = 0; sk->sk_allocation = gfp; + sk->sk_sndbuf = sysctl_decnet_wmem[1]; + sk->sk_rcvbuf = sysctl_decnet_rmem[1]; /* Initialization of DECnet Session Control Port */ scp = DN_SK(sk); diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 02bca49cb50..0e9d2c57116 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -10,6 +10,7 @@ * * Changes: * Steve Whitehouse - C99 changes and default device handling + * Steve Whitehouse - Memory buffer settings, like the tcp ones * */ #include <linux/config.h> @@ -37,6 +38,11 @@ int decnet_dr_count = 3; int decnet_log_martians = 1; int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW; +/* Reasonable defaults, I 
hope, based on tcp's defaults */ +int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; +int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; +int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; + #ifdef CONFIG_SYSCTL extern int decnet_dst_gc_interval; static int min_decnet_time_wait[] = { 5 }; @@ -428,6 +434,33 @@ static ctl_table dn_table[] = { .extra1 = &min_decnet_no_fc_max_cwnd, .extra2 = &max_decnet_no_fc_max_cwnd }, + { + .ctl_name = NET_DECNET_MEM, + .procname = "decnet_mem", + .data = &sysctl_decnet_mem, + .maxlen = sizeof(sysctl_decnet_mem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = NET_DECNET_RMEM, + .procname = "decnet_rmem", + .data = &sysctl_decnet_rmem, + .maxlen = sizeof(sysctl_decnet_rmem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = NET_DECNET_WMEM, + .procname = "decnet_wmem", + .data = &sysctl_decnet_wmem, + .maxlen = sizeof(sysctl_decnet_wmem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, { .ctl_name = NET_DECNET_DEBUG_LEVEL, .procname = "debug", diff --git a/net/ieee80211/Kconfig b/net/ieee80211/Kconfig index 91b16fbf91f..d18ccba3ea9 100644 --- a/net/ieee80211/Kconfig +++ b/net/ieee80211/Kconfig @@ -55,7 +55,7 @@ config IEEE80211_CRYPT_CCMP config IEEE80211_CRYPT_TKIP tristate "IEEE 802.11i TKIP encryption" - depends on IEEE80211 + depends on IEEE80211 && NET_RADIO select CRYPTO select CRYPTO_MICHAEL_MIC ---help--- diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a4c347c3b8e..46f9d9cf7a5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -618,7 +618,7 @@ static int ipgre_rcv(struct sk_buff *skb) skb->mac.raw = skb->nh.raw; skb->nh.raw = __pskb_pull(skb, offset); - skb_postpull_rcsum(skb, skb->mac.raw, offset); + skb_postpull_rcsum(skb, skb->h.raw, offset); memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); skb->pkt_type = PACKET_HOST; 
#ifdef CONFIG_NET_IPGRE_BROADCAST diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 0bc00528d88..88a60650e6b 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -56,8 +56,8 @@ config IP_NF_CONNTRACK_MARK instead of the individual packets. config IP_NF_CONNTRACK_EVENTS - bool "Connection tracking events" - depends on IP_NF_CONNTRACK + bool "Connection tracking events (EXPERIMENTAL)" + depends on EXPERIMENTAL && IP_NF_CONNTRACK help If this option is enabled, the connection tracking code will provide a notifier chain that can be used by other kernel code @@ -66,8 +66,8 @@ config IP_NF_CONNTRACK_EVENTS IF unsure, say `N'. config IP_NF_CONNTRACK_NETLINK - tristate 'Connection tracking netlink interface' - depends on IP_NF_CONNTRACK && NETFILTER_NETLINK + tristate 'Connection tracking netlink interface (EXPERIMENTAL)' + depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m help This option enables support for a netlink-based userspace interface diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 058c48e258f..d0a447e520a 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -12,6 +12,7 @@ ip_nat_pptp-objs := ip_nat_helper_pptp.o ip_nat_proto_gre.o # connection tracking obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o +obj-$(CONFIG_IP_NF_NAT) += ip_nat.o # conntrack netlink interface obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o @@ -41,7 +42,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o # the three instances of ip_tables obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o -obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o ip_nat.o +obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o # matches diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 7a4ecddd597..84c66dbfeda 100644 --- 
a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1345,6 +1345,11 @@ static int kill_all(struct ip_conntrack *i, void *data) return 1; } +void ip_conntrack_flush(void) +{ + ip_ct_iterate_cleanup(kill_all, NULL); +} + static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) { if (vmalloced) @@ -1354,8 +1359,12 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) get_order(sizeof(struct list_head) * size)); } -void ip_conntrack_flush(void) +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) { + ip_ct_attach = NULL; + /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module delete... */ @@ -1363,7 +1372,7 @@ void ip_conntrack_flush(void) ip_ct_event_cache_flush(); i_see_dead_people: - ip_ct_iterate_cleanup(kill_all, NULL); + ip_conntrack_flush(); if (atomic_read(&ip_conntrack_count) != 0) { schedule(); goto i_see_dead_people; @@ -1371,14 +1380,7 @@ void ip_conntrack_flush(void) /* wait until all references to ip_conntrack_untracked are dropped */ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) schedule(); -} -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. 
*/ -void ip_conntrack_cleanup(void) -{ - ip_ct_attach = NULL; - ip_conntrack_flush(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 3fce91bcc0b..91fe8f2e38f 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -503,7 +503,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) } static const size_t cta_min_proto[CTA_PROTO_MAX] = { - [CTA_PROTO_NUM-1] = sizeof(u_int16_t), + [CTA_PROTO_NUM-1] = sizeof(u_int8_t), [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), @@ -528,7 +528,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, if (!tb[CTA_PROTO_NUM-1]) return -EINVAL; - tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); proto = ip_conntrack_proto_find_get(tuple->dst.protonum); @@ -728,11 +728,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, return -ENOENT; } } - if (del_timer(&ct->timeout)) { - ip_conntrack_put(ct); + if (del_timer(&ct->timeout)) ct->timeout.function((unsigned long)ct); - return 0; - } + ip_conntrack_put(ct); DEBUGP("leaving\n"); @@ -877,7 +875,7 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) DEBUGP("NAT status: %lu\n", status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - if (ip_nat_initialized(ct, hooknum)) + if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) return -EEXIST; ip_nat_setup_info(ct, &range, hooknum); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index aeb7353d477..e7fa29e576d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -341,9 
+341,10 @@ static int tcp_print_conntrack(struct seq_file *s, static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, const struct ip_conntrack *ct) { - struct nfattr *nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); + struct nfattr *nest_parms; read_lock_bh(&tcp_lock); + nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), &ct->proto.tcp.state); read_unlock_bh(&tcp_lock); diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c index 2215317c76b..43c3bd7c118 100644 --- a/net/ipv4/netfilter/ip_nat_tftp.c +++ b/net/ipv4/netfilter/ip_nat_tftp.c @@ -42,7 +42,10 @@ static unsigned int help(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, struct ip_conntrack_expect *exp) { - exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; + struct ip_conntrack *ct = exp->master; + + exp->saved_proto.udp.port + = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; exp->dir = IP_CT_DIR_REPLY; exp->expectfn = ip_nat_follow_master; if (ip_conntrack_expect_related(exp) != 0) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 029c70dfb58..b7325e0b406 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. 
*/ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { - if (skb != NULL) { - const struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - int tcp_header_size = tp->tcp_header_len; - struct tcphdr *th; - int sysctl_flags; - int err; + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet; + struct tcp_sock *tp; + struct tcp_skb_cb *tcb; + int tcp_header_size; + struct tcphdr *th; + int sysctl_flags; + int err; + + BUG_ON(!skb || !tcp_skb_pcount(skb)); + + /* If congestion control is doing timestamping, we must + * take such a timestamp before we potentially clone/copy. + */ + if (icsk->icsk_ca_ops->rtt_sample) + __net_timestamp(skb); + + if (likely(clone_it)) { + if (unlikely(skb_cloned(skb))) + skb = pskb_copy(skb, gfp_mask); + else + skb = skb_clone(skb, gfp_mask); + if (unlikely(!skb)) + return -ENOBUFS; + } - BUG_ON(!tcp_skb_pcount(skb)); + inet = inet_sk(sk); + tp = tcp_sk(sk); + tcb = TCP_SKB_CB(skb); + tcp_header_size = tp->tcp_header_len; #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 - /* If congestion control is doing timestamping */ - if (icsk->icsk_ca_ops->rtt_sample) - __net_timestamp(skb); - - sysctl_flags = 0; - if (tcb->flags & TCPCB_FLAG_SYN) { - tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if(sysctl_tcp_timestamps) { - tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_TSTAMPS; - } - if(sysctl_tcp_window_scaling) { - tcp_header_size += TCPOLEN_WSCALE_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_WSCALE; - } - if(sysctl_tcp_sack) { - sysctl_flags |= SYSCTL_FLAG_SACK; - if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) - tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; - } - } else if (tp->rx_opt.eff_sacks) { - /* A SACK is 2 pad bytes, 
a 2 byte header, plus - * 2 32-bit sequence numbers for each SACK block. - */ - tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); + sysctl_flags = 0; + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; + if(sysctl_tcp_timestamps) { + tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_TSTAMPS; } - - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(sk, CA_EVENT_TX_START); - - th = (struct tcphdr *) skb_push(skb, tcp_header_size); - skb->h.th = th; - skb_set_owner_w(skb, sk); - - /* Build TCP header and checksum it. */ - th->source = inet->sport; - th->dest = inet->dport; - th->seq = htonl(tcb->seq); - th->ack_seq = htonl(tp->rcv_nxt); - *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); - if (tcb->flags & TCPCB_FLAG_SYN) { - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. - */ - th->window = htons(tp->rcv_wnd); - } else { - th->window = htons(tcp_select_window(sk)); + if (sysctl_tcp_window_scaling) { + tcp_header_size += TCPOLEN_WSCALE_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_WSCALE; } - th->check = 0; - th->urg_ptr = 0; - - if (tp->urg_mode && - between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) { - th->urg_ptr = htons(tp->snd_up-tcb->seq); - th->urg = 1; + if (sysctl_tcp_sack) { + sysctl_flags |= SYSCTL_FLAG_SACK; + if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; } + } else if (unlikely(tp->rx_opt.eff_sacks)) { + /* A SACK is 2 pad bytes, a 2 byte header, plus + * 2 32-bit sequence numbers for each SACK block. + */ + tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK)); + } + + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); + + th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb->h.th = th; + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. 
*/ + th->source = inet->sport; + th->dest = inet->dport; + th->seq = htonl(tcb->seq); + th->ack_seq = htonl(tp->rcv_nxt); + *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | + tcb->flags); + + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + th->window = htons(tp->rcv_wnd); + } else { + th->window = htons(tcp_select_window(sk)); + } + th->check = 0; + th->urg_ptr = 0; - if (tcb->flags & TCPCB_FLAG_SYN) { - tcp_syn_build_options((__u32 *)(th + 1), - tcp_advertise_mss(sk), - (sysctl_flags & SYSCTL_FLAG_TSTAMPS), - (sysctl_flags & SYSCTL_FLAG_SACK), - (sysctl_flags & SYSCTL_FLAG_WSCALE), - tp->rx_opt.rcv_wscale, - tcb->when, - tp->rx_opt.ts_recent); - } else { - tcp_build_and_update_options((__u32 *)(th + 1), - tp, tcb->when); + if (unlikely(tp->urg_mode && + between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) { + th->urg_ptr = htons(tp->snd_up-tcb->seq); + th->urg = 1; + } - TCP_ECN_send(sk, tp, skb, tcp_header_size); - } - tp->af_specific->send_check(sk, th, skb->len, skb); + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + tcp_syn_build_options((__u32 *)(th + 1), + tcp_advertise_mss(sk), + (sysctl_flags & SYSCTL_FLAG_TSTAMPS), + (sysctl_flags & SYSCTL_FLAG_SACK), + (sysctl_flags & SYSCTL_FLAG_WSCALE), + tp->rx_opt.rcv_wscale, + tcb->when, + tp->rx_opt.ts_recent); + } else { + tcp_build_and_update_options((__u32 *)(th + 1), + tp, tcb->when); + TCP_ECN_send(sk, tp, skb, tcp_header_size); + } - if (tcb->flags & TCPCB_FLAG_ACK) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); + tp->af_specific->send_check(sk, th, skb->len, skb); - if (skb->len != tcp_header_size) - tcp_event_data_sent(tp, skb, sk); + if (likely(tcb->flags & TCPCB_FLAG_ACK)) + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb, sk); - err = tp->af_specific->queue_xmit(skb, 0); - if (err <= 0) - return err; + 
TCP_INC_STATS(TCP_MIB_OUTSEGS); - tcp_enter_cwr(sk); + err = tp->af_specific->queue_xmit(skb, 0); + if (unlikely(err <= 0)) + return err; + + tcp_enter_cwr(sk); + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; - /* NET_XMIT_CN is special. It does not guarantee, - * that this packet is lost. It tells that device - * is about to start to drop packets or already - * drops some packets of the same priority and - * invokes us to send less aggressively. - */ - return err == NET_XMIT_CN ? 0 : err; - } - return -ENOBUFS; #undef SYSCTL_FLAG_TSTAMPS #undef SYSCTL_FLAG_WSCALE #undef SYSCTL_FLAG_SACK @@ -1036,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) + if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC))) break; /* Advance the send_head. This one is sent out. @@ -1109,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { + if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { update_send_head(sk, tp, skb); tcp_cwnd_validate(sk, tp); return; @@ -1429,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, (skb_cloned(skb) ? - pskb_copy(skb, GFP_ATOMIC): - skb_clone(skb, GFP_ATOMIC))); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (err == 0) { /* Update global TCP statistics. 
*/ @@ -1665,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (tcp_transmit_skb(sk, skb)) + if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); } @@ -1700,7 +1715,7 @@ int tcp_send_synack(struct sock *sk) TCP_ECN_send_synack(tcp_sk(sk), skb); } TCP_SKB_CB(skb)->when = tcp_time_stamp; - return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /* @@ -1861,7 +1876,7 @@ int tcp_connect(struct sock *sk) __skb_queue_tail(&sk->sk_write_queue, buff); sk_charge_skb(sk, buff); tp->packets_out += tcp_skb_pcount(buff); - tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); + tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ @@ -1957,7 +1972,7 @@ void tcp_send_ack(struct sock *sk) /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff); + tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); } } @@ -1997,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) TCP_SKB_CB(skb)->seq = urgent ? 
tp->snd_una : tp->snd_una - 1; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - return tcp_transmit_skb(sk, skb); + return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } int tcp_write_wakeup(struct sock *sk) @@ -2030,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) { update_send_head(sk, tp, skb); } diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index b7d296a8ac6..13e7e6e8df1 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -215,14 +215,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, vegas->beg_snd_nxt = tp->snd_nxt; vegas->beg_snd_cwnd = tp->snd_cwnd; - /* Take into account the current RTT sample too, to - * decrease the impact of delayed acks. This double counts - * this sample since we count it for the next window as well, - * but that's not too awful, since we're taking the min, - * rather than averaging. - */ - tcp_vegas_rtt_calc(sk, seq_rtt * 1000); - /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. @@ -333,11 +325,11 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, else if (tp->snd_cwnd > tp->snd_cwnd_clamp) tp->snd_cwnd = tp->snd_cwnd_clamp; } - } - /* Wipe the slate clean for the next RTT. */ - vegas->cntRTT = 0; - vegas->minRTT = 0x7fffffff; + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } } /* Extract info for Tcp socket info provided via netlink. 
*/ diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index b2b60f3e9cd..42196ba3b0b 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -182,6 +182,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) case IPPROTO_UDP: case IPPROTO_TCP: case IPPROTO_SCTP: + case IPPROTO_DCCP: if (pskb_may_pull(skb, xprth + 4 - skb->data)) { u16 *ports = (u16 *)xprth; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 76ff9f4fe89..a60585fd85a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -137,6 +137,7 @@ static int addrconf_ifdown(struct net_device *dev, int how); static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags); static void addrconf_dad_timer(unsigned long data); static void addrconf_dad_completed(struct inet6_ifaddr *ifp); +static void addrconf_dad_run(struct inet6_dev *idev); static void addrconf_rs_timer(unsigned long data); static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); @@ -379,8 +380,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) dev->type == ARPHRD_NONE || dev->type == ARPHRD_SIT) { printk(KERN_INFO - "Disabled Privacy Extensions on device %p(%s)\n", - dev, dev->name); + "%s: Disabled Privacy Extensions\n", + dev->name); ndev->cnf.use_tempaddr = -1; } else { in6_dev_hold(ndev); @@ -388,6 +389,9 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) } #endif + if (netif_carrier_ok(dev)) + ndev->if_flags |= IF_READY; + write_lock_bh(&addrconf_lock); dev->ip6_ptr = ndev; write_unlock_bh(&addrconf_lock); @@ -415,6 +419,7 @@ static struct inet6_dev * ipv6_find_idev(struct net_device *dev) if ((idev = ipv6_add_dev(dev)) == NULL) return NULL; } + if (dev->flags&IFF_UP) ipv6_mc_up(idev); return idev; @@ -634,8 +639,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) } #endif - for (ifap = &idev->addr_list; (ifa=*ifap) != NULL; - ifap = &ifa->if_next) { + for (ifap = 
&idev->addr_list; (ifa=*ifap) != NULL;) { if (ifa == ifp) { *ifap = ifa->if_next; __in6_ifa_put(ifp); @@ -643,6 +647,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0) break; deleted = 1; + continue; } else if (ifp->flags & IFA_F_PERMANENT) { if (ipv6_prefix_equal(&ifa->addr, &ifp->addr, ifp->prefix_len)) { @@ -666,6 +671,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) } } } + ifap = &ifa->if_next; } write_unlock_bh(&idev->lock); @@ -903,11 +909,18 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, score.addr_type = __ipv6_addr_type(&ifa->addr); - /* Rule 0: Candidate Source Address (section 4) + /* Rule 0: + * - Tentative Address (RFC2462 section 5.4) + * - A tentative address is not considered + * "assigned to an interface" in the traditional + * sense. + * - Candidate Source Address (section 4) * - In any case, anycast addresses, multicast * addresses, and the unspecified address MUST * NOT be included in a candidate set. */ + if (ifa->flags & IFA_F_TENTATIVE) + continue; if (unlikely(score.addr_type == IPV6_ADDR_ANY || score.addr_type & IPV6_ADDR_MULTICAST)) { LIMIT_NETDEBUG(KERN_DEBUG @@ -1215,10 +1228,8 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) /* Gets referenced address, destroys ifaddr */ -void addrconf_dad_failure(struct inet6_ifaddr *ifp) +void addrconf_dad_stop(struct inet6_ifaddr *ifp) { - if (net_ratelimit()) - printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); if (ifp->flags&IFA_F_PERMANENT) { spin_lock_bh(&ifp->lock); addrconf_del_timer(ifp); @@ -1244,6 +1255,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ipv6_del_addr(ifp); } +void addrconf_dad_failure(struct inet6_ifaddr *ifp) +{ + if (net_ratelimit()) + printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); + addrconf_dad_stop(ifp); +} /* Join to solicited addr multicast group. 
*/ @@ -1596,9 +1613,17 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) not good. */ if (valid_lft >= 0x7FFFFFFF/HZ) - rt_expires = 0; + rt_expires = 0x7FFFFFFF - (0x7FFFFFFF % HZ); else - rt_expires = jiffies + valid_lft * HZ; + rt_expires = valid_lft * HZ; + + /* + * We convert this (in jiffies) to clock_t later. + * Avoid arithmetic overflow there as well. + * Overflow can happen only if HZ < USER_HZ. + */ + if (HZ < USER_HZ && rt_expires > 0x7FFFFFFF / USER_HZ) + rt_expires = 0x7FFFFFFF / USER_HZ; if (pinfo->onlink) { struct rt6_info *rt; @@ -1610,12 +1635,12 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) ip6_del_rt(rt, NULL, NULL, NULL); rt = NULL; } else { - rt->rt6i_expires = rt_expires; + rt->rt6i_expires = jiffies + rt_expires; } } } else if (valid_lft) { addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); + dev, jiffies_to_clock_t(rt_expires), RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); } if (rt) dst_release(&rt->u.dst); @@ -2125,9 +2150,42 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, { struct net_device *dev = (struct net_device *) data; struct inet6_dev *idev = __in6_dev_get(dev); + int run_pending = 0; switch(event) { case NETDEV_UP: + case NETDEV_CHANGE: + if (event == NETDEV_UP) { + if (!netif_carrier_ok(dev)) { + /* device is not ready yet. */ + printk(KERN_INFO + "ADDRCONF(NETDEV_UP): %s: " + "link is not ready\n", + dev->name); + break; + } + } else { + if (!netif_carrier_ok(dev)) { + /* device is still not ready. */ + break; + } + + if (idev) { + if (idev->if_flags & IF_READY) { + /* device is already configured. 
*/ + break; + } + idev->if_flags |= IF_READY; + } + + printk(KERN_INFO + "ADDRCONF(NETDEV_CHANGE): %s: " + "link becomes ready\n", + dev->name); + + run_pending = 1; + } + switch(dev->type) { case ARPHRD_SIT: addrconf_sit_config(dev); @@ -2144,6 +2202,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; }; if (idev) { + if (run_pending) + addrconf_dad_run(idev); + /* If the MTU changed during the interface down, when the interface up, the changed MTU must be reflected in the idev as well as routers. @@ -2178,8 +2239,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, */ addrconf_ifdown(dev, event != NETDEV_DOWN); break; - case NETDEV_CHANGE: - break; + case NETDEV_CHANGENAME: #ifdef CONFIG_SYSCTL if (idev) { @@ -2260,7 +2320,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) /* Step 3: clear flags for stateless addrconf */ if (how != 1) - idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD); + idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); /* Step 4: clear address list */ #ifdef CONFIG_IPV6_PRIVACY @@ -2369,11 +2429,20 @@ out: /* * Duplicate Address Detection */ +static void addrconf_dad_kick(struct inet6_ifaddr *ifp) +{ + unsigned long rand_num; + struct inet6_dev *idev = ifp->idev; + + rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); + ifp->probes = idev->cnf.dad_transmits; + addrconf_mod_timer(ifp, AC_DAD, rand_num); +} + static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; - unsigned long rand_num; addrconf_join_solict(dev, &ifp->addr); @@ -2382,7 +2451,6 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) flags); net_srandom(ifp->addr.s6_addr32[3]); - rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? 
: 1); read_lock_bh(&idev->lock); if (ifp->dead) @@ -2399,9 +2467,19 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) return; } - ifp->probes = idev->cnf.dad_transmits; - addrconf_mod_timer(ifp, AC_DAD, rand_num); - + if (!(idev->if_flags & IF_READY)) { + spin_unlock_bh(&ifp->lock); + read_unlock_bh(&idev->lock); + /* + * If the device is not ready: + * - keep it tentative if it is a permanent address. + * - otherwise, kill it. + */ + in6_ifa_hold(ifp); + addrconf_dad_stop(ifp); + return; + } + addrconf_dad_kick(ifp); spin_unlock_bh(&ifp->lock); out: read_unlock_bh(&idev->lock); @@ -2484,6 +2562,22 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) } } +static void addrconf_dad_run(struct inet6_dev *idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) { + spin_lock_bh(&ifp->lock); + if (!(ifp->flags & IFA_F_TENTATIVE)) { + spin_unlock_bh(&ifp->lock); + continue; + } + spin_unlock_bh(&ifp->lock); + addrconf_dad_kick(ifp); + } + read_unlock_bh(&idev->lock); +} + #ifdef CONFIG_PROC_FS struct if6_iter_state { int bucket; @@ -2689,6 +2783,9 @@ restart: in6_ifa_hold(ifpub); spin_unlock(&ifp->lock); read_unlock(&addrconf_hash_lock); + spin_lock(&ifpub->lock); + ifpub->regen_count = 0; + spin_unlock(&ifpub->lock); ipv6_create_tempaddr(ifpub, ifp); in6_ifa_put(ifpub); in6_ifa_put(ifp); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 40d9a1935ab..8bfbe997079 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -248,7 +248,7 @@ static u32 esp6_get_max_size(struct xfrm_state *x, int mtu) if (esp->conf.padlen) mtu = ALIGN(mtu, esp->conf.padlen); - return mtu + x->props.header_len + esp->auth.icv_full_len; + return mtu + x->props.header_len + esp->auth.icv_trunc_len; } static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 34a332225c1..6ec6a2b549b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ 
-328,8 +328,10 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, iif = skb->dev->ifindex; /* - * Must not send if we know that source is Anycast also. - * for now we don't know that. + * Must not send error if the source does not uniquely + * identify a single node (RFC2463 Section 2.4). + * We check unspecified / multicast addresses here, + * and anycast addresses will be checked later. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); @@ -373,6 +375,16 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, err = ip6_dst_lookup(sk, &dst, &fl); if (err) goto out; + + /* + * We won't send icmp if the destination is known + * anycast. + */ + if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n"); + goto out_dst_release; + } + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) goto out; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index fd939da090c..f829a4ad3cc 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -170,7 +170,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, #define MLDV2_QQIC(value) MLDV2_EXP(0x80, 4, 3, value) #define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value) -#define IPV6_MLD_MAX_MSF 10 +#define IPV6_MLD_MAX_MSF 64 int sysctl_mld_max_msf = IPV6_MLD_MAX_MSF; @@ -224,6 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = MCAST_EXCLUDE; + mc_lst->sflock = RW_LOCK_UNLOCKED; mc_lst->sflist = NULL; /* @@ -360,6 +361,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, struct ip6_sf_socklist *psl; int i, j, rv; int leavegroup = 0; + int pmclocked = 0; int err; if (pgsr->gsr_group.ss_family != AF_INET6 || @@ -403,6 +405,9 @@ int ip6_mc_source(int add, int omode, struct sock *sk, pmc->sfmode = omode; } + write_lock_bh(&pmc->sflock); + 
pmclocked = 1; + psl = pmc->sflist; if (!add) { if (!psl) @@ -475,6 +480,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk, /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: + if (pmclocked) + write_unlock_bh(&pmc->sflock); read_unlock_bh(&ipv6_sk_mc_lock); read_unlock_bh(&idev->lock); in6_dev_put(idev); @@ -510,6 +517,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) dev = idev->dev; err = 0; + read_lock_bh(&ipv6_sk_mc_lock); + if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { leavegroup = 1; goto done; @@ -549,6 +558,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) newpsl = NULL; (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); } + + write_lock_bh(&pmc->sflock); psl = pmc->sflist; if (psl) { (void) ip6_mc_del_src(idev, group, pmc->sfmode, @@ -558,8 +569,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); pmc->sflist = newpsl; pmc->sfmode = gsf->gf_fmode; + write_unlock_bh(&pmc->sflock); err = 0; done: + read_unlock_bh(&ipv6_sk_mc_lock); read_unlock_bh(&idev->lock); in6_dev_put(idev); dev_put(dev); @@ -592,6 +605,11 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, dev = idev->dev; err = -EADDRNOTAVAIL; + /* + * changes to the ipv6_mc_list require the socket lock and + * a read lock on ipv6_sk_mc_lock. We have the socket lock, + * so reading the list is safe. + */ for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { if (pmc->ifindex != gsf->gf_interface) @@ -614,6 +632,10 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { return -EFAULT; } + /* changes to psl require the socket lock, a read lock on + * ipv6_sk_mc_lock and a write lock on pmc->sflock. We + * have the socket lock, so reading here is safe. 
+ */ for (i=0; i<copycount; i++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; @@ -650,6 +672,7 @@ int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr, read_unlock(&ipv6_sk_mc_lock); return 1; } + read_lock(&mc->sflock); psl = mc->sflist; if (!psl) { rv = mc->sfmode == MCAST_EXCLUDE; @@ -665,6 +688,7 @@ int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr, if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) rv = 0; } + read_unlock(&mc->sflock); read_unlock(&ipv6_sk_mc_lock); return rv; @@ -1068,7 +1092,8 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) ma->mca_flags |= MAF_TIMER_RUNNING; } -static void mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, +/* mark EXCLUDE-mode sources */ +static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, struct in6_addr *srcs) { struct ip6_sf_list *psf; @@ -1078,13 +1103,53 @@ static void mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { if (scount == nsrcs) break; - for (i=0; i<nsrcs; i++) + for (i=0; i<nsrcs; i++) { + /* skip inactive filters */ + if (pmc->mca_sfcount[MCAST_INCLUDE] || + pmc->mca_sfcount[MCAST_EXCLUDE] != + psf->sf_count[MCAST_EXCLUDE]) + continue; + if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { + scount++; + break; + } + } + } + pmc->mca_flags &= ~MAF_GSQUERY; + if (scount == nsrcs) /* all sources excluded */ + return 0; + return 1; +} + +static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, + struct in6_addr *srcs) +{ + struct ip6_sf_list *psf; + int i, scount; + + if (pmc->mca_sfmode == MCAST_EXCLUDE) + return mld_xmarksources(pmc, nsrcs, srcs); + + /* mark INCLUDE-mode sources */ + + scount = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; i<nsrcs; i++) { if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { psf->sf_gsresp = 1; scount++; break; } + } + } + if (!scount) { + pmc->mca_flags &= ~MAF_GSQUERY; + return 0; } + 
pmc->mca_flags |= MAF_GSQUERY; + return 1; } int igmp6_event_query(struct sk_buff *skb) @@ -1167,7 +1232,7 @@ int igmp6_event_query(struct sk_buff *skb) /* mark sources to include, if group & source-specific */ if (mlh2->nsrcs != 0) { if (!pskb_may_pull(skb, srcs_offset + - mlh2->nsrcs * sizeof(struct in6_addr))) { + ntohs(mlh2->nsrcs) * sizeof(struct in6_addr))) { in6_dev_put(idev); return -EINVAL; } @@ -1203,10 +1268,9 @@ int igmp6_event_query(struct sk_buff *skb) else ma->mca_flags &= ~MAF_GSQUERY; } - if (ma->mca_flags & MAF_GSQUERY) - mld_marksources(ma, ntohs(mlh2->nsrcs), - mlh2->srcs); - igmp6_group_queried(ma, max_delay); + if (!(ma->mca_flags & MAF_GSQUERY) || + mld_marksources(ma, ntohs(mlh2->nsrcs), mlh2->srcs)) + igmp6_group_queried(ma, max_delay); spin_unlock_bh(&ma->mca_lock); if (group_type != IPV6_ADDR_ANY) break; @@ -1281,7 +1345,18 @@ static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, case MLD2_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return 0; - return !((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp); + if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) { + if (pmc->mca_sfmode == MCAST_INCLUDE) + return 1; + /* don't include if this source is excluded + * in all filters + */ + if (psf->sf_count[MCAST_INCLUDE]) + return 0; + return pmc->mca_sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + } + return 0; case MLD2_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return 0; @@ -1450,7 +1525,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_report *pmr; struct mld2_grec *pgr = NULL; struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; - int scount, first, isquery, truncate; + int scount, stotal, first, isquery, truncate; if (pmc->mca_flags & MAF_NOREPORT) return skb; @@ -1460,25 +1535,13 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, truncate = type == MLD2_MODE_IS_EXCLUDE || type == MLD2_CHANGE_TO_EXCLUDE; + stotal = scount = 
0; + psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; - if (!*psf_list) { - if (type == MLD2_ALLOW_NEW_SOURCES || - type == MLD2_BLOCK_OLD_SOURCES) - return skb; - if (pmc->mca_crcount || isquery) { - /* make sure we have room for group header and at - * least one source. - */ - if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)+ - sizeof(struct in6_addr)) { - mld_sendpack(skb); - skb = NULL; /* add_grhead will get a new one */ - } - skb = add_grhead(skb, pmc, type, &pgr); - } - return skb; - } + if (!*psf_list) + goto empty_source; + pmr = skb ? (struct mld2_report *)skb->h.raw : NULL; /* EX and TO_EX get a fresh packet, if needed */ @@ -1491,7 +1554,6 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } } first = 1; - scount = 0; psf_prev = NULL; for (psf=*psf_list; psf; psf=psf_next) { struct in6_addr *psrc; @@ -1525,7 +1587,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc)); *psrc = psf->sf_addr; - scount++; + scount++; stotal++; if ((type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { psf->sf_crcount--; @@ -1540,6 +1602,21 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } psf_prev = psf; } + +empty_source: + if (!stotal) { + if (type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) + return skb; + if (pmc->mca_crcount || isquery) { + /* make sure we have room for group header */ + if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { + mld_sendpack(skb); + skb = NULL; /* add_grhead will get a new one */ + } + skb = add_grhead(skb, pmc, type, &pgr); + } + } if (pgr) pgr->grec_nsrcs = htons(scount); @@ -1621,11 +1698,11 @@ static void mld_send_cr(struct inet6_dev *idev) skb = add_grec(skb, pmc, dtype, 1, 1); } if (pmc->mca_crcount) { - pmc->mca_crcount--; if (pmc->mca_sfmode == MCAST_EXCLUDE) { type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 
0); } + pmc->mca_crcount--; if (pmc->mca_crcount == 0) { mld_clear_zeros(&pmc->mca_tomb); mld_clear_zeros(&pmc->mca_sources); @@ -1659,12 +1736,12 @@ static void mld_send_cr(struct inet6_dev *idev) /* filter mode changes */ if (pmc->mca_crcount) { - pmc->mca_crcount--; if (pmc->mca_sfmode == MCAST_EXCLUDE) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); + pmc->mca_crcount--; } spin_unlock_bh(&pmc->mca_lock); } @@ -2023,6 +2100,9 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, { int err; + /* callers have the socket lock and a write lock on ipv6_sk_mc_lock, + * so no other readers or writers of iml or its sflist + */ if (iml->sflist == 0) { /* any-source empty exclude case */ return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 060d6120241..04912f9b35c 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -211,7 +211,7 @@ config IP6_NF_TARGET_REJECT config IP6_NF_TARGET_NFQUEUE tristate "NFQUEUE Target Support" - depends on IP_NF_IPTABLES + depends on IP6_NF_IPTABLES help This Target replaced the old obsolete QUEUE target. 
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index c0f1da5497a..a7e03cfacd0 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -68,8 +68,8 @@ static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 }; - __u8 type = orig->dst.u.icmp.type - 128; - if (type >= sizeof(invmap) || !invmap[type]) + int type = orig->dst.u.icmp.type - 128; + if (type < 0 || type >= sizeof(invmap) || !invmap[type]) return 0; tuple->src.u.icmp.id = orig->src.u.icmp.id; @@ -129,12 +129,12 @@ static int icmpv6_new(struct nf_conn *conntrack, [ICMPV6_ECHO_REQUEST - 128] = 1, [ICMPV6_NI_QUERY - 128] = 1 }; + int type = conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128; - if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new) - || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) { + if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { /* Can't create a new ICMPv6 `conn' with this. 
*/ - DEBUGP("icmp: can't create new conn with type %u\n", - conntrack->tuplehash[0].tuple.dst.u.icmp.type); + DEBUGP("icmpv6: can't create new conn with type %u\n", + type + 128); NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); return 0; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a7a537b5059..66140f13d11 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -413,11 +413,14 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, rt = ip6_rt_copy(ort); if (rt) { - ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); - - if (!(rt->rt6i_flags&RTF_GATEWAY)) + if (!(rt->rt6i_flags&RTF_GATEWAY)) { + if (rt->rt6i_dst.plen != 128 && + ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)) + rt->rt6i_flags |= RTF_ANYCAST; ipv6_addr_copy(&rt->rt6i_gateway, daddr); + } + ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; rt->u.dst.flags |= DST_HOST; @@ -829,7 +832,7 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, } rt->u.dst.obsolete = -1; - rt->rt6i_expires = clock_t_to_jiffies(rtmsg->rtmsg_info); + rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info); if (nlh && (r = NLMSG_DATA(nlh))) { rt->rt6i_protocol = r->rtm_protocol; } else { @@ -1413,7 +1416,9 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->u.dst.obsolete = -1; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; - if (!anycast) + if (anycast) + rt->rt6i_flags |= RTF_ANYCAST; + else rt->rt6i_flags |= RTF_LOCAL; rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); if (rt->rt6i_nexthop == NULL) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 62c0e5bd931..8827389abaf 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -992,13 +992,12 @@ static void tcp_v6_send_reset(struct sk_buff *skb) /* sk = NULL, but it is safe for now. RST socket required. 
*/ if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { - if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) + if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, buff, &fl, NULL, 0); + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); return; - - ip6_xmit(NULL, buff, &fl, NULL, 0); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); - return; + } } kfree_skb(buff); @@ -1057,11 +1056,11 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 fl.fl_ip_sport = t1->source; if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { - if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) + if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, buff, &fl, NULL, 0); + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); return; - ip6_xmit(NULL, buff, &fl, NULL, 0); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); - return; + } } kfree_skb(buff); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index cf1d91e74c8..69bd957380e 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -214,6 +214,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) case IPPROTO_UDP: case IPPROTO_TCP: case IPPROTO_SCTP: + case IPPROTO_DCCP: if (pskb_may_pull(skb, skb->nh.raw + offset + 4 - skb->data)) { u16 *ports = (u16 *)exthdr; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index a84f9221e5f..794c41d19b2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -61,8 +61,8 @@ config NF_CONNTRACK_MARK instead of the individual packets. 
config NF_CONNTRACK_EVENTS - bool "Connection tracking events" - depends on NF_CONNTRACK + bool "Connection tracking events (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK help If this option is enabled, the connection tracking code will provide a notifier chain that can be used by other kernel code diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1da678303d7..a7c7b490cf2 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1383,6 +1383,9 @@ void nf_conntrack_cleanup(void) schedule(); goto i_see_dead_people; } + /* wait until all references to nf_conntrack_untracked are dropped */ + while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) + schedule(); for (i = 0; i < NF_CT_F_NUM; i++) { if (nf_ct_cache[i].use == 0) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index a60c59b9763..95fdf04f1d8 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -162,7 +162,7 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, return -EINVAL; } - min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg)); + min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); if (unlikely(nlh->nlmsg_len < min_len)) return -EINVAL; @@ -236,8 +236,7 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, } /* All the messages must at least contain nfgenmsg */ - if (nlh->nlmsg_len < - NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) { + if (nlh->nlmsg_len < NLMSG_SPACE(sizeof(struct nfgenmsg))) { DEBUGP("received message was too short\n"); return 0; } diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index 004e8599b8f..a7d88b5ad75 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -99,7 +99,7 @@ static int nr_state1_machine(struct sock *sk, struct sk_buff *skb, break; case NR_RESET: - if (sysctl_netrom_reset_circuit); + if (sysctl_netrom_reset_circuit) nr_disconnect(sk, ECONNRESET); break; @@ -130,7 +130,7 @@ static int nr_state2_machine(struct sock 
*sk, struct sk_buff *skb, break; case NR_RESET: - if (sysctl_netrom_reset_circuit); + if (sysctl_netrom_reset_circuit) nr_disconnect(sk, ECONNRESET); break; @@ -265,7 +265,7 @@ static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype break; case NR_RESET: - if (sysctl_netrom_reset_circuit); + if (sysctl_netrom_reset_circuit) nr_disconnect(sk, ECONNRESET); break; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 499ae3df4a4..3e246276041 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1587,23 +1587,47 @@ static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order) return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1); } -static void free_pg_vec(char **pg_vec, unsigned order, unsigned len) +static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len) { int i; - for (i=0; i<len; i++) { - if (pg_vec[i]) { - struct page *page, *pend; - - pend = pg_vec_endpage(pg_vec[i], order); - for (page = virt_to_page(pg_vec[i]); page <= pend; page++) - ClearPageReserved(page); - free_pages((unsigned long)pg_vec[i], order); - } + for (i = 0; i < len; i++) { + if (likely(pg_vec[i])) + free_pages((unsigned long) pg_vec[i], order); } kfree(pg_vec); } +static inline char *alloc_one_pg_vec_page(unsigned long order) +{ + return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO, + order); +} + +static char **alloc_pg_vec(struct tpacket_req *req, int order) +{ + unsigned int block_nr = req->tp_block_nr; + char **pg_vec; + int i; + + pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL); + if (unlikely(!pg_vec)) + goto out; + + for (i = 0; i < block_nr; i++) { + pg_vec[i] = alloc_one_pg_vec_page(order); + if (unlikely(!pg_vec[i])) + goto out_free_pgvec; + } + +out: + return pg_vec; + +out_free_pgvec: + free_pg_vec(pg_vec, order, block_nr); + pg_vec = NULL; + goto out; +} static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing) { @@ -1617,64 +1641,46 @@ 
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing /* Sanity tests and some calculations */ - if (po->pg_vec) + if (unlikely(po->pg_vec)) return -EBUSY; - if ((int)req->tp_block_size <= 0) + if (unlikely((int)req->tp_block_size <= 0)) return -EINVAL; - if (req->tp_block_size&(PAGE_SIZE-1)) + if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) return -EINVAL; - if (req->tp_frame_size < TPACKET_HDRLEN) + if (unlikely(req->tp_frame_size < TPACKET_HDRLEN)) return -EINVAL; - if (req->tp_frame_size&(TPACKET_ALIGNMENT-1)) + if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) return -EINVAL; po->frames_per_block = req->tp_block_size/req->tp_frame_size; - if (po->frames_per_block <= 0) + if (unlikely(po->frames_per_block <= 0)) return -EINVAL; - if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr) + if (unlikely((po->frames_per_block * req->tp_block_nr) != + req->tp_frame_nr)) return -EINVAL; - /* OK! */ - - /* Allocate page vector */ - while ((PAGE_SIZE<<order) < req->tp_block_size) - order++; err = -ENOMEM; - - pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL); - if (pg_vec == NULL) + order = get_order(req->tp_block_size); + pg_vec = alloc_pg_vec(req, order); + if (unlikely(!pg_vec)) goto out; - memset(pg_vec, 0, req->tp_block_nr*sizeof(char **)); - - for (i=0; i<req->tp_block_nr; i++) { - struct page *page, *pend; - pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order); - if (!pg_vec[i]) - goto out_free_pgvec; - - pend = pg_vec_endpage(pg_vec[i], order); - for (page = virt_to_page(pg_vec[i]); page <= pend; page++) - SetPageReserved(page); - } - /* Page vector is allocated */ l = 0; - for (i=0; i<req->tp_block_nr; i++) { + for (i = 0; i < req->tp_block_nr; i++) { char *ptr = pg_vec[i]; struct tpacket_hdr *header; int k; - for (k=0; k<po->frames_per_block; k++) { - - header = (struct tpacket_hdr*)ptr; + for (k = 0; k < po->frames_per_block; k++) { + header = (struct tpacket_hdr *) ptr; header->tp_status = 
TP_STATUS_KERNEL; ptr += req->tp_frame_size; } } /* Done */ } else { - if (req->tp_frame_nr) + if (unlikely(req->tp_frame_nr)) return -EINVAL; } @@ -1701,7 +1707,7 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing spin_lock_bh(&sk->sk_receive_queue.lock); pg_vec = XC(po->pg_vec, pg_vec); - po->frame_max = req->tp_frame_nr-1; + po->frame_max = (req->tp_frame_nr - 1); po->head = 0; po->frame_size = req->tp_frame_size; spin_unlock_bh(&sk->sk_receive_queue.lock); @@ -1728,7 +1734,6 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing release_sock(sk); -out_free_pgvec: if (pg_vec) free_pg_vec(pg_vec, order, req->tp_block_nr); out: @@ -1755,17 +1760,19 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE) goto out; - atomic_inc(&po->mapped); start = vma->vm_start; - err = -EAGAIN; - for (i=0; i<po->pg_vec_len; i++) { - if (remap_pfn_range(vma, start, - __pa(po->pg_vec[i]) >> PAGE_SHIFT, - po->pg_vec_pages*PAGE_SIZE, - vma->vm_page_prot)) - goto out; - start += po->pg_vec_pages*PAGE_SIZE; + for (i = 0; i < po->pg_vec_len; i++) { + struct page *page = virt_to_page(po->pg_vec[i]); + int pg_num; + + for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) { + err = vm_insert_page(vma, start, page); + if (unlikely(err)) + goto out; + start += PAGE_SIZE; + } } + atomic_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 8aebe8f6d27..2ce1cb2aa2e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -34,7 +34,7 @@ #include <net/sch_generic.h> #include <net/act_api.h> -#if 1 /* control */ +#if 0 /* control */ #define DPRINTK(format, args...) printk(KERN_DEBUG format, ##args) #else #define DPRINTK(format, args...) 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d890dfa8818..9df888e932c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -156,10 +156,6 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk) sizeof(struct sk_buff) + sizeof(struct sctp_chunk); - sk->sk_wmem_queued += SCTP_DATA_SNDSIZE(chunk) + - sizeof(struct sk_buff) + - sizeof(struct sctp_chunk); - atomic_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); } @@ -3425,7 +3421,7 @@ static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_add } static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, - void * __user *to, size_t space_left) + void __user **to, size_t space_left) { struct list_head *pos; struct sctp_sockaddr_entry *addr; @@ -4426,7 +4422,7 @@ cleanup: * tcp_poll(). Note that, based on these implementations, we don't * lock the socket in this function, even though it seems that, * ideally, locking or some other mechanisms can be used to ensure - * the integrity of the counters (sndbuf and wmem_queued) used + * the integrity of the counters (sndbuf and wmem_alloc) used * in this place. We assume that we don't need locks either until proven * otherwise. * @@ -4833,10 +4829,6 @@ static void sctp_wfree(struct sk_buff *skb) sizeof(struct sk_buff) + sizeof(struct sctp_chunk); - sk->sk_wmem_queued -= SCTP_DATA_SNDSIZE(chunk) + - sizeof(struct sk_buff) + - sizeof(struct sctp_chunk); - atomic_sub(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); sock_wfree(skb); @@ -4920,7 +4912,7 @@ void sctp_write_space(struct sock *sk) /* Is there any sndbuf space available on the socket? * - * Note that wmem_queued is the sum of the send buffers on all of the + * Note that sk_wmem_alloc is the sum of the send buffers on all of the * associations on the same socket. For a UDP-style socket with * multiple associations, it is possible for it to be "unwriteable" * prematurely. 
I assume that this is acceptable because @@ -4933,7 +4925,7 @@ static int sctp_writeable(struct sock *sk) { int amt = 0; - amt = sk->sk_sndbuf - sk->sk_wmem_queued; + amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); if (amt < 0) amt = 0; return amt; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index f44f46f1d8e..8d782282ec1 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -638,7 +638,7 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) gss_msg); atomic_inc(&gss_msg->count); gss_unhash_msg(gss_msg); - if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) { + if (msg->errno == -ETIMEDOUT) { unsigned long now = jiffies; if (time_after(now, ratelimit)) { printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n" @@ -786,7 +786,9 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int taskflags) cred->gc_flags = 0; cred->gc_base.cr_ops = &gss_credops; cred->gc_service = gss_auth->service; - err = gss_create_upcall(gss_auth, cred); + do { + err = gss_create_upcall(gss_auth, cred); + } while (err == -EAGAIN); if (err < 0) goto out_err; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index c76ea221798..16a2458f38f 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -174,7 +174,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) goto out; msg = (struct rpc_pipe_msg *)filp->private_data; if (msg != NULL) { - msg->errno = -EPIPE; + msg->errno = -EAGAIN; list_del_init(&msg->list); rpci->ops->destroy_msg(msg); } @@ -183,7 +183,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) if (filp->f_mode & FMODE_READ) rpci->nreaders --; if (!rpci->nreaders) - __rpc_purge_upcall(inode, -EPIPE); + __rpc_purge_upcall(inode, -EAGAIN); if (rpci->ops->release_pipe) rpci->ops->release_pipe(inode); out: diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0a51fd46a84..77e8800d412 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -990,6 
+990,7 @@ static void xs_udp_connect_worker(void *args) sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_no_check = UDP_CSUM_NORCV; + sk->sk_allocation = GFP_ATOMIC; xprt_set_connected(xprt); @@ -1074,6 +1075,7 @@ static void xs_tcp_connect_worker(void *args) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; + sk->sk_allocation = GFP_ATOMIC; /* socket options */ sk->sk_userlocks |= SOCK_BINDPORT_LOCK; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0db9e57013f..d19e274b9c4 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -346,6 +346,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct xfrm_policy *pol, **p; struct xfrm_policy *delpol = NULL; struct xfrm_policy **newpos = NULL; + struct dst_entry *gc_list; write_lock_bh(&xfrm_policy_lock); for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) { @@ -381,9 +382,36 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_pol_hold(policy); write_unlock_bh(&xfrm_policy_lock); - if (delpol) { + if (delpol) xfrm_policy_kill(delpol); + + read_lock_bh(&xfrm_policy_lock); + gc_list = NULL; + for (policy = policy->next; policy; policy = policy->next) { + struct dst_entry *dst; + + write_lock(&policy->lock); + dst = policy->bundles; + if (dst) { + struct dst_entry *tail = dst; + while (tail->next) + tail = tail->next; + tail->next = gc_list; + gc_list = dst; + + policy->bundles = NULL; + } + write_unlock(&policy->lock); + } + read_unlock_bh(&xfrm_policy_lock); + + while (gc_list) { + struct dst_entry *dst = gc_list; + + gc_list = dst->next; + dst_free(dst); } + return 0; } EXPORT_SYMBOL(xfrm_policy_insert); @@ -1014,13 +1042,12 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) } EXPORT_SYMBOL(__xfrm_route_forward); -/* Optimize later using cookies and generation ids. 
*/ - static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { - if (!stale_bundle(dst)) - return dst; - + /* If it is marked obsolete, which is how we even get here, + * then we have purged it from the policy bundle list and we + * did that for a good reason. + */ return NULL; } @@ -1104,6 +1131,16 @@ int xfrm_flush_bundles(void) return 0; } +static int always_true(struct dst_entry *dst) +{ + return 1; +} + +void xfrm_flush_all_bundles(void) +{ + xfrm_prune_bundles(always_true); +} + void xfrm_init_pmtu(struct dst_entry *dst) { do { diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 7cf48aa6c95..479effc9766 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -431,6 +431,8 @@ void xfrm_state_insert(struct xfrm_state *x) spin_lock_bh(&xfrm_state_lock); __xfrm_state_insert(x); spin_unlock_bh(&xfrm_state_lock); + + xfrm_flush_all_bundles(); } EXPORT_SYMBOL(xfrm_state_insert); @@ -478,6 +480,9 @@ out: spin_unlock_bh(&xfrm_state_lock); xfrm_state_put_afinfo(afinfo); + if (!err) + xfrm_flush_all_bundles(); + if (x1) { xfrm_state_delete(x1); xfrm_state_put(x1); |